* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-03 20:25 ` [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting Adrián Larumbe
@ 2024-09-04 7:49 ` Boris Brezillon
2024-09-12 15:03 ` Adrián Larumbe
2024-09-04 16:10 ` kernel test robot
` (2 subsequent siblings)
3 siblings, 1 reply; 15+ messages in thread
From: Boris Brezillon @ 2024-09-04 7:49 UTC (permalink / raw)
To: Adrián Larumbe
Cc: Steven Price, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, David Airlie, Daniel Vetter, Sumit Semwal,
Christian König, kernel, dri-devel, linux-kernel,
linux-media, linaro-mm-sig
On Tue, 3 Sep 2024 21:25:35 +0100
Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
> Enable calculations of job submission times in clock cycles and wall
> time. This is done by expanding the boilerplate command stream when running
> a job to include instructions that compute said times right before an after
> a user CS.
>
> A separate kernel BO is created per queue to store those values. Jobs can
> access their sampled data through a slots buffer-specific index different
> from that of the queue's ringbuffer. The reason for this is saving memory
> on the profiling information kernel BO, since the amount of simultaneous
> profiled jobs we can write into the queue's ringbuffer might be much
> smaller than for regular jobs, as the former take more CSF instructions.
>
> This commit is done in preparation for enabling DRM fdinfo support in the
> Panthor driver, which depends on the numbers calculated herein.
>
> A profile mode mask has been added that will in a future commit allow UM to
> toggle performance metric sampling behaviour, which is disabled by default
> to save power. When a ringbuffer CS is constructed, timestamp and cycling
> sampling instructions are added depending on the enabled flags in the
> profiling mask.
>
> A helper was provided that calculates the number of instructions for a
> given set of enablement mask, and these are passed as the number of credits
> when initialising a DRM scheduler job.
>
> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> ---
> drivers/gpu/drm/panthor/panthor_device.h | 22 ++
> drivers/gpu/drm/panthor/panthor_sched.c | 327 ++++++++++++++++++++---
> 2 files changed, 305 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index e388c0472ba7..a48e30d0af30 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -66,6 +66,25 @@ struct panthor_irq {
> atomic_t suspended;
> };
>
> +/**
> + * enum panthor_device_profiling_mode - Profiling state
> + */
> +enum panthor_device_profiling_flags {
> + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> + PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> +
> + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> +
> + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> +
> + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> + PANTHOR_DEVICE_PROFILING_ALL =
> + PANTHOR_DEVICE_PROFILING_CYCLES |
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> +};
> +
> /**
> * struct panthor_device - Panthor device
> */
> @@ -162,6 +181,9 @@ struct panthor_device {
> */
> struct page *dummy_latest_flush;
> } pm;
> +
> + /** @profile_mask: User-set profiling flags for job accounting. */
> + u32 profile_mask;
> };
>
> /**
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index c426a392b081..b087648bf59a 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -93,6 +93,9 @@
> #define MIN_CSGS 3
> #define MAX_CSG_PRIO 0xf
>
> +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
> +#define MAX_INSTRS_PER_JOB 32
> +
> struct panthor_group;
>
> /**
> @@ -476,6 +479,18 @@ struct panthor_queue {
> */
> struct list_head in_flight_jobs;
> } fence_ctx;
> +
> + /** @profiling_info: Job profiling data slots and access information. */
> + struct {
> + /** @slots: Kernel BO holding the slots. */
> + struct panthor_kernel_bo *slots;
> +
> + /** @slot_count: Number of jobs ringbuffer can hold at once. */
> + u32 slot_count;
> +
> + /** @profiling_seqno: Index of the next available profiling information slot. */
> + u32 profiling_seqno;
Nit: no need to repeat profiling as it's under the profiling_info
struct. I would simply name that one 'seqno'.
> + } profiling_info;
s/profiling_info/profiling/ ?
> };
>
> /**
> @@ -661,6 +676,18 @@ struct panthor_group {
> struct list_head wait_node;
> };
>
> +struct panthor_job_profiling_data {
> + struct {
> + u64 before;
> + u64 after;
> + } cycles;
> +
> + struct {
> + u64 before;
> + u64 after;
> + } time;
> +};
> +
> /**
> * group_queue_work() - Queue a group work
> * @group: Group to queue the work for.
> @@ -774,6 +801,12 @@ struct panthor_job {
>
> /** @done_fence: Fence signaled when the job is finished or cancelled. */
> struct dma_fence *done_fence;
> +
> + /** @profile_mask: Current device job profiling enablement bitmask. */
> + u32 profile_mask;
> +
> + /** @profile_slot: Job profiling information index in the profiling slots BO. */
> + u32 profiling_slot;
Nit: we tend to group fields together under sub-structs, so I'd say:
struct {
u32 mask; // or flags
u32 slot;
} profiling;
> };
>
> static void
> @@ -838,6 +871,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>
> panthor_kernel_bo_destroy(queue->ringbuf);
> panthor_kernel_bo_destroy(queue->iface.mem);
> + panthor_kernel_bo_destroy(queue->profiling_info.slots);
>
> /* Release the last_fence we were holding, if any. */
> dma_fence_put(queue->fence_ctx.last_fence);
> @@ -1982,8 +2016,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> }
> }
>
> -#define NUM_INSTRS_PER_SLOT 16
> -
> static void
> group_term_post_processing(struct panthor_group *group)
> {
> @@ -2815,65 +2847,211 @@ static void group_sync_upd_work(struct work_struct *work)
> group_put(group);
> }
>
> -static struct dma_fence *
> -queue_run_job(struct drm_sched_job *sched_job)
> +struct panthor_job_ringbuf_instrs {
> + u64 buffer[MAX_INSTRS_PER_JOB];
> + u32 count;
> +};
> +
> +struct panthor_job_instr {
> + u32 profile_mask;
> + u64 instr;
> +};
> +
> +#define JOB_INSTR(__prof, __instr) \
> + { \
> + .profile_mask = __prof, \
> + .instr = __instr, \
> + }
> +
> +static void
> +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> + struct panthor_job *job,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> + u32 start = job->ringbuf.start & (ringbuf_size - 1);
> + ssize_t size, written;
> +
> + /*
> + * We need to write a whole slot, including any trailing zeroes
> + * that may come at the end of it. Also, because instrs.buffer had
> + * been zero-initialised, there's no need to pad it with 0's
> + */
> + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + size = instrs->count * sizeof(u64);
> + written = min(ringbuf_size - start, size);
> +
> + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> +
> + if (written < size)
> + memcpy(queue->ringbuf->kmap,
> + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)],
> + size - written);
> +}
> +
> +struct panthor_job_cs_params {
> + u32 profile_mask;
> + u64 addr_reg; u64 val_reg;
> + u64 cycle_reg; u64 time_reg;
> + u64 sync_addr; u64 times_addr;
> + u64 cs_start; u64 cs_size;
> + u32 last_flush; u32 waitall_mask;
> +};
> +
> +static void
> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> {
> - struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> struct panthor_group *group = job->group;
> struct panthor_queue *queue = group->queues[job->queue_idx];
> struct panthor_device *ptdev = group->ptdev;
> struct panthor_scheduler *sched = ptdev->scheduler;
> - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> - u64 addr_reg = ptdev->csif_info.cs_reg_count -
> - ptdev->csif_info.unpreserved_cs_reg_count;
> - u64 val_reg = addr_reg + 2;
> - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> - struct dma_fence *done_fence;
> - int ret;
>
> - u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> + params->addr_reg = ptdev->csif_info.cs_reg_count -
> + ptdev->csif_info.unpreserved_cs_reg_count;
> + params->val_reg = params->addr_reg + 2;
> + params->cycle_reg = params->addr_reg;
> + params->time_reg = params->val_reg;
> +
> + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling_info.slots) +
> + (job->profiling_slot * sizeof(struct panthor_job_profiling_data));
> + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> +
> + params->cs_start = job->call_info.start;
> + params->cs_size = job->call_info.size;
> + params->last_flush = job->call_info.latest_flush;
> +
> + params->profile_mask = job->profile_mask;
> +}
> +
> +static void
> +prepare_job_instrs(const struct panthor_job_cs_params *params,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + const struct panthor_job_instr instr_seq[] = {
> /* MOV32 rX+2, cs.latest_flush */
> - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (2ull << 56) | (params->val_reg << 48) | params->last_flush),
>
> /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> +
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, cycles.before))),
> + /* STORE_STATE cycles */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (1ull << 56) | (params->time_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, time.before))),
> + /* STORE_STATE timer */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>
> /* MOV48 rX:rX+1, cs.start */
> - (1ull << 56) | (addr_reg << 48) | job->call_info.start,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> /* MOV32 rX+2, cs.size */
> - (2ull << 56) | (val_reg << 48) | job->call_info.size,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (2ull << 56) | (params->val_reg << 48) | params->cs_size),
> /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> - (3ull << 56) | (1 << 16),
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)),
> /* CALL rX:rX+1, rX+2 */
> - (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> +
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, cycles.after))),
> + /* STORE_STATE cycles */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> +
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (1ull << 56) | (params->time_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, time.after))),
> + /* STORE_STATE timer */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>
> /* MOV48 rX:rX+1, sync_addr */
> - (1ull << 56) | (addr_reg << 48) | sync_addr,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> /* MOV48 rX+2, #1 */
> - (1ull << 56) | (val_reg << 48) | 1,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->val_reg << 48) | 1),
> /* WAIT(all) */
> - (3ull << 56) | (waitall_mask << 16),
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (3ull << 56) | (params->waitall_mask << 16)),
> /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> + (params->val_reg << 32) | (0 << 16) | 1),
> /* ERROR_BARRIER, so we can recover from faults at job
> * boundaries.
> */
> - (47ull << 56),
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)),
> + };
> + u32 pad;
> +
> + /* NEED to be cacheline aligned to please the prefetcher. */
> + static_assert(sizeof(instrs->buffer) % 64 == 0,
> + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> +
> + /* Make sure we have enough storage to store the whole sequence. */
> + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) <=
> + ARRAY_SIZE(instrs->buffer),
> + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
We probably want to catch situations where instrs->buffer has gone
bigger than needed (say we found a way to drop instructions), so I
would turn the '<=' condition into an '=='.
> +
> + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
> + /* If the profile mask of this instruction is not enabled, skip it. */
> + if (instr_seq[i].profile_mask &&
> + !(instr_seq[i].profile_mask & params->profile_mask))
> + continue;
> +
> + instrs->buffer[instrs->count++] = instr_seq[i].instr;
> + }
> +
> + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + memset(&instrs->buffer[instrs->count], 0,
> + (pad - instrs->count) * sizeof(instrs->buffer[0]));
> + instrs->count = pad;
> +}
> +
> +static u32 calc_job_credits(u32 profile_mask)
> +{
> + struct panthor_job_ringbuf_instrs instrs;
> + struct panthor_job_cs_params params = {
> + .profile_mask = profile_mask,
> };
>
> - /* Need to be cacheline aligned to please the prefetcher. */
> - static_assert(sizeof(call_instrs) % 64 == 0,
> - "call_instrs is not aligned on a cacheline");
> + prepare_job_instrs(¶ms, &instrs);
> + return instrs.count;
> +}
> +
> +static struct dma_fence *
> +queue_run_job(struct drm_sched_job *sched_job)
> +{
> + struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> + struct panthor_group *group = job->group;
> + struct panthor_queue *queue = group->queues[job->queue_idx];
> + struct panthor_device *ptdev = group->ptdev;
> + struct panthor_scheduler *sched = ptdev->scheduler;
> + struct panthor_job_ringbuf_instrs instrs;
> + struct panthor_job_cs_params cs_params;
> + struct dma_fence *done_fence;
> + int ret;
>
> /* Stream size is zero, nothing to do except making sure all previously
> * submitted jobs are done before we signal the
> @@ -2900,17 +3078,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> queue->fence_ctx.id,
> atomic64_inc_return(&queue->fence_ctx.seqno));
>
> - memcpy(queue->ringbuf->kmap + ringbuf_insert,
> - call_instrs, sizeof(call_instrs));
> + job->profiling_slot = queue->profiling_info.profiling_seqno++;
> + if (queue->profiling_info.profiling_seqno == queue->profiling_info.slot_count)
> + queue->profiling_info.profiling_seqno = 0;
> +
> + job->ringbuf.start = queue->iface.input->insert;
> +
> + get_job_cs_params(job, &cs_params);
> + prepare_job_instrs(&cs_params, &instrs);
> + copy_instrs_to_ringbuf(queue, job, &instrs);
> +
> + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>
> panthor_job_get(&job->base);
> spin_lock(&queue->fence_ctx.lock);
> list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> spin_unlock(&queue->fence_ctx.lock);
>
> - job->ringbuf.start = queue->iface.input->insert;
> - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> -
> /* Make sure the ring buffer is updated before the INSERT
> * register.
> */
> @@ -3003,6 +3187,24 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
> .free_job = queue_free_job,
> };
>
> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
> + u32 cs_ringbuf_size)
> +{
> + u32 min_profiled_job_instrs = U32_MAX;
> + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
> +
> + for (u32 i = 0; i < last_flag; i++) {
> + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
> + min_profiled_job_instrs =
> + min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
> + }
Okay, I think this loop deserves an explanation. The goal is to
calculate the minimal size of a profile job so we can deduce the
maximum number of profiling slots that will be used simultaneously. We
ignore PANTHOR_DEVICE_PROFILING_DISABLED, because those jobs won't use
a profiling slot in the first place.
> +
> + drm_WARN_ON(&ptdev->base,
> + !IS_ALIGNED(min_profiled_job_instrs, NUM_INSTRS_PER_CACHE_LINE));
We can probably drop this WARN_ON(), it's supposed to be checked in
calc_job_credits().
> +
> + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
> +}
> +
> static struct panthor_queue *
> group_create_queue(struct panthor_group *group,
> const struct drm_panthor_queue_create *args)
> @@ -3056,9 +3258,38 @@ group_create_queue(struct panthor_group *group,
> goto err_free_queue;
> }
>
> + queue->profiling_info.slot_count =
> + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
> +
> + queue->profiling_info.slots =
> + panthor_kernel_bo_create(group->ptdev, group->vm,
> + queue->profiling_info.slot_count *
> + sizeof(struct panthor_job_profiling_data),
> + DRM_PANTHOR_BO_NO_MMAP,
> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> + PANTHOR_VM_KERNEL_AUTO_VA);
> +
> + if (IS_ERR(queue->profiling_info.slots)) {
> + ret = PTR_ERR(queue->profiling_info.slots);
> + goto err_free_queue;
> + }
> +
> + ret = panthor_kernel_bo_vmap(queue->profiling_info.slots);
> + if (ret)
> + goto err_free_queue;
> +
> + memset(queue->profiling_info.slots->kmap, 0,
> + queue->profiling_info.slot_count * sizeof(struct panthor_job_profiling_data));
I don't think we need to memset() the profiling buffer.
> +
> + /*
> + * Credit limit argument tells us the total number of instructions
> + * across all CS slots in the ringbuffer, with some jobs requiring
> + * twice as many as others, depending on their profiling status.
> + */
> ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> group->ptdev->scheduler->wq, 1,
> - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> + args->ringbuf_size / sizeof(u64),
> 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
> group->ptdev->reset.wq,
> NULL, "panthor-queue", group->ptdev->base.dev);
> @@ -3354,6 +3585,7 @@ panthor_job_create(struct panthor_file *pfile,
> {
> struct panthor_group_pool *gpool = pfile->groups;
> struct panthor_job *job;
> + u32 credits;
> int ret;
>
> if (qsubmit->pad)
> @@ -3407,9 +3639,16 @@ panthor_job_create(struct panthor_file *pfile,
> }
> }
>
> + job->profile_mask = pfile->ptdev->profile_mask;
> + credits = calc_job_credits(job->profile_mask);
> + if (credits == 0) {
> + ret = -EINVAL;
> + goto err_put_job;
> + }
> +
> ret = drm_sched_job_init(&job->base,
> &job->group->queues[job->queue_idx]->entity,
> - 1, job->group);
> + credits, job->group);
> if (ret)
> goto err_put_job;
>
Just add a bunch of minor comments (mostly cosmetic changes), but the
implementation looks good to me.
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-04 7:49 ` Boris Brezillon
@ 2024-09-12 15:03 ` Adrián Larumbe
2024-09-12 15:10 ` Boris Brezillon
0 siblings, 1 reply; 15+ messages in thread
From: Adrián Larumbe @ 2024-09-12 15:03 UTC (permalink / raw)
To: Boris Brezillon
Cc: Steven Price, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, David Airlie, Daniel Vetter, Sumit Semwal,
Christian König, kernel, dri-devel, linux-kernel,
linux-media, linaro-mm-sig
Hi Boris, thanks for the remarks,
On 04.09.2024 09:49, Boris Brezillon wrote:
> On Tue, 3 Sep 2024 21:25:35 +0100
> Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
>
> > Enable calculations of job submission times in clock cycles and wall
> > time. This is done by expanding the boilerplate command stream when running
> > a job to include instructions that compute said times right before an after
> > a user CS.
> >
> > A separate kernel BO is created per queue to store those values. Jobs can
> > access their sampled data through a slots buffer-specific index different
> > from that of the queue's ringbuffer. The reason for this is saving memory
> > on the profiling information kernel BO, since the amount of simultaneous
> > profiled jobs we can write into the queue's ringbuffer might be much
> > smaller than for regular jobs, as the former take more CSF instructions.
> >
> > This commit is done in preparation for enabling DRM fdinfo support in the
> > Panthor driver, which depends on the numbers calculated herein.
> >
> > A profile mode mask has been added that will in a future commit allow UM to
> > toggle performance metric sampling behaviour, which is disabled by default
> > to save power. When a ringbuffer CS is constructed, timestamp and cycling
> > sampling instructions are added depending on the enabled flags in the
> > profiling mask.
> >
> > A helper was provided that calculates the number of instructions for a
> > given set of enablement mask, and these are passed as the number of credits
> > when initialising a DRM scheduler job.
> >
> > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> > ---
> > drivers/gpu/drm/panthor/panthor_device.h | 22 ++
> > drivers/gpu/drm/panthor/panthor_sched.c | 327 ++++++++++++++++++++---
> > 2 files changed, 305 insertions(+), 44 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > index e388c0472ba7..a48e30d0af30 100644
> > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -66,6 +66,25 @@ struct panthor_irq {
> > atomic_t suspended;
> > };
> >
> > +/**
> > + * enum panthor_device_profiling_mode - Profiling state
> > + */
> > +enum panthor_device_profiling_flags {
> > + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> > + PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> > +
> > + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> > + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> > +
> > + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> > + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> > +
> > + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> > + PANTHOR_DEVICE_PROFILING_ALL =
> > + PANTHOR_DEVICE_PROFILING_CYCLES |
> > + PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > +};
> > +
> > /**
> > * struct panthor_device - Panthor device
> > */
> > @@ -162,6 +181,9 @@ struct panthor_device {
> > */
> > struct page *dummy_latest_flush;
> > } pm;
> > +
> > + /** @profile_mask: User-set profiling flags for job accounting. */
> > + u32 profile_mask;
> > };
> >
> > /**
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index c426a392b081..b087648bf59a 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -93,6 +93,9 @@
> > #define MIN_CSGS 3
> > #define MAX_CSG_PRIO 0xf
> >
> > +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
> > +#define MAX_INSTRS_PER_JOB 32
> > +
> > struct panthor_group;
> >
> > /**
> > @@ -476,6 +479,18 @@ struct panthor_queue {
> > */
> > struct list_head in_flight_jobs;
> > } fence_ctx;
> > +
> > + /** @profiling_info: Job profiling data slots and access information. */
> > + struct {
> > + /** @slots: Kernel BO holding the slots. */
> > + struct panthor_kernel_bo *slots;
> > +
> > + /** @slot_count: Number of jobs ringbuffer can hold at once. */
> > + u32 slot_count;
> > +
> > + /** @profiling_seqno: Index of the next available profiling information slot. */
> > + u32 profiling_seqno;
>
> Nit: no need to repeat profiling as it's under the profiling_info
> struct. I would simply name that one 'seqno'.
>
> > + } profiling_info;
>
> s/profiling_info/profiling/ ?
>
> > };
> >
> > /**
> > @@ -661,6 +676,18 @@ struct panthor_group {
> > struct list_head wait_node;
> > };
> >
> > +struct panthor_job_profiling_data {
> > + struct {
> > + u64 before;
> > + u64 after;
> > + } cycles;
> > +
> > + struct {
> > + u64 before;
> > + u64 after;
> > + } time;
> > +};
> > +
> > /**
> > * group_queue_work() - Queue a group work
> > * @group: Group to queue the work for.
> > @@ -774,6 +801,12 @@ struct panthor_job {
> >
> > /** @done_fence: Fence signaled when the job is finished or cancelled. */
> > struct dma_fence *done_fence;
> > +
> > + /** @profile_mask: Current device job profiling enablement bitmask. */
> > + u32 profile_mask;
> > +
> > + /** @profile_slot: Job profiling information index in the profiling slots BO. */
> > + u32 profiling_slot;
>
> Nit: we tend to group fields together under sub-structs, so I'd say:
>
> struct {
> u32 mask; // or flags
> u32 slot;
> } profiling;
>
> > };
> >
> > static void
> > @@ -838,6 +871,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> >
> > panthor_kernel_bo_destroy(queue->ringbuf);
> > panthor_kernel_bo_destroy(queue->iface.mem);
> > + panthor_kernel_bo_destroy(queue->profiling_info.slots);
> >
> > /* Release the last_fence we were holding, if any. */
> > dma_fence_put(queue->fence_ctx.last_fence);
> > @@ -1982,8 +2016,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> > }
> > }
> >
> > -#define NUM_INSTRS_PER_SLOT 16
> > -
> > static void
> > group_term_post_processing(struct panthor_group *group)
> > {
> > @@ -2815,65 +2847,211 @@ static void group_sync_upd_work(struct work_struct *work)
> > group_put(group);
> > }
> >
> > -static struct dma_fence *
> > -queue_run_job(struct drm_sched_job *sched_job)
> > +struct panthor_job_ringbuf_instrs {
> > + u64 buffer[MAX_INSTRS_PER_JOB];
> > + u32 count;
> > +};
> > +
> > +struct panthor_job_instr {
> > + u32 profile_mask;
> > + u64 instr;
> > +};
> > +
> > +#define JOB_INSTR(__prof, __instr) \
> > + { \
> > + .profile_mask = __prof, \
> > + .instr = __instr, \
> > + }
> > +
> > +static void
> > +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> > + struct panthor_job *job,
> > + struct panthor_job_ringbuf_instrs *instrs)
> > +{
> > + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > + u32 start = job->ringbuf.start & (ringbuf_size - 1);
> > + ssize_t size, written;
> > +
> > + /*
> > + * We need to write a whole slot, including any trailing zeroes
> > + * that may come at the end of it. Also, because instrs.buffer had
> > + * been zero-initialised, there's no need to pad it with 0's
> > + */
> > + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> > + size = instrs->count * sizeof(u64);
> > + written = min(ringbuf_size - start, size);
> > +
> > + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> > +
> > + if (written < size)
> > + memcpy(queue->ringbuf->kmap,
> > + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)],
> > + size - written);
> > +}
> > +
> > +struct panthor_job_cs_params {
> > + u32 profile_mask;
> > + u64 addr_reg; u64 val_reg;
> > + u64 cycle_reg; u64 time_reg;
> > + u64 sync_addr; u64 times_addr;
> > + u64 cs_start; u64 cs_size;
> > + u32 last_flush; u32 waitall_mask;
> > +};
> > +
> > +static void
> > +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> > {
> > - struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> > struct panthor_group *group = job->group;
> > struct panthor_queue *queue = group->queues[job->queue_idx];
> > struct panthor_device *ptdev = group->ptdev;
> > struct panthor_scheduler *sched = ptdev->scheduler;
> > - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> > - u64 addr_reg = ptdev->csif_info.cs_reg_count -
> > - ptdev->csif_info.unpreserved_cs_reg_count;
> > - u64 val_reg = addr_reg + 2;
> > - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > - struct dma_fence *done_fence;
> > - int ret;
> >
> > - u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> > + params->addr_reg = ptdev->csif_info.cs_reg_count -
> > + ptdev->csif_info.unpreserved_cs_reg_count;
> > + params->val_reg = params->addr_reg + 2;
> > + params->cycle_reg = params->addr_reg;
> > + params->time_reg = params->val_reg;
> > +
> > + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling_info.slots) +
> > + (job->profiling_slot * sizeof(struct panthor_job_profiling_data));
> > + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > +
> > + params->cs_start = job->call_info.start;
> > + params->cs_size = job->call_info.size;
> > + params->last_flush = job->call_info.latest_flush;
> > +
> > + params->profile_mask = job->profile_mask;
> > +}
> > +
> > +static void
> > +prepare_job_instrs(const struct panthor_job_cs_params *params,
> > + struct panthor_job_ringbuf_instrs *instrs)
> > +{
> > + const struct panthor_job_instr instr_seq[] = {
> > /* MOV32 rX+2, cs.latest_flush */
> > - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (2ull << 56) | (params->val_reg << 48) | params->last_flush),
> >
> > /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> > - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> > +
> > + /* MOV48 rX:rX+1, cycles_offset */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > + (1ull << 56) | (params->cycle_reg << 48) |
> > + (params->times_addr +
> > + offsetof(struct panthor_job_profiling_data, cycles.before))),
> > + /* STORE_STATE cycles */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > + /* MOV48 rX:rX+1, time_offset */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > + (1ull << 56) | (params->time_reg << 48) |
> > + (params->times_addr +
> > + offsetof(struct panthor_job_profiling_data, time.before))),
> > + /* STORE_STATE timer */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> >
> > /* MOV48 rX:rX+1, cs.start */
> > - (1ull << 56) | (addr_reg << 48) | job->call_info.start,
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> > /* MOV32 rX+2, cs.size */
> > - (2ull << 56) | (val_reg << 48) | job->call_info.size,
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (2ull << 56) | (params->val_reg << 48) | params->cs_size),
> > /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> > - (3ull << 56) | (1 << 16),
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)),
> > /* CALL rX:rX+1, rX+2 */
> > - (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> > +
> > + /* MOV48 rX:rX+1, cycles_offset */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > + (1ull << 56) | (params->cycle_reg << 48) |
> > + (params->times_addr +
> > + offsetof(struct panthor_job_profiling_data, cycles.after))),
> > + /* STORE_STATE cycles */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > +
> > + /* MOV48 rX:rX+1, time_offset */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > + (1ull << 56) | (params->time_reg << 48) |
> > + (params->times_addr +
> > + offsetof(struct panthor_job_profiling_data, time.after))),
> > + /* STORE_STATE timer */
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> >
> > /* MOV48 rX:rX+1, sync_addr */
> > - (1ull << 56) | (addr_reg << 48) | sync_addr,
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> > /* MOV48 rX+2, #1 */
> > - (1ull << 56) | (val_reg << 48) | 1,
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (1ull << 56) | (params->val_reg << 48) | 1),
> > /* WAIT(all) */
> > - (3ull << 56) | (waitall_mask << 16),
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (3ull << 56) | (params->waitall_mask << 16)),
> > /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> > - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> > -
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> > + (params->val_reg << 32) | (0 << 16) | 1),
> > /* ERROR_BARRIER, so we can recover from faults at job
> > * boundaries.
> > */
> > - (47ull << 56),
> > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)),
> > + };
> > + u32 pad;
> > +
> > + /* NEED to be cacheline aligned to please the prefetcher. */
> > + static_assert(sizeof(instrs->buffer) % 64 == 0,
> > + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> > +
> > + /* Make sure we have enough storage to store the whole sequence. */
> > + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) <=
> > + ARRAY_SIZE(instrs->buffer),
> > + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
>
> We probably want to catch situations where instrs->buffer has gone
> bigger than needed (say we found a way to drop instructions), so I
> would turn the '<=' condition into an '=='.
I did this but it's triggering the static assert, because the instr_seq array has 19 elements,
which when aligned to NUM_INSTRS_PER_CACHE_LINE (8 I think) renders 24, still less than the
32 slots in instrs->buffer. Having a slightly oversized instrs.buffer array isn't a problem
though because instrs.count is being used when copying them into the ringbuffer, so I think
leaving this assert as an <= should be alright.
> > +
> > + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
> > + /* If the profile mask of this instruction is not enabled, skip it. */
> > + if (instr_seq[i].profile_mask &&
> > + !(instr_seq[i].profile_mask & params->profile_mask))
> > + continue;
> > +
> > + instrs->buffer[instrs->count++] = instr_seq[i].instr;
> > + }
> > +
> > + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> > + memset(&instrs->buffer[instrs->count], 0,
> > + (pad - instrs->count) * sizeof(instrs->buffer[0]));
> > + instrs->count = pad;
> > +}
> > +
> > +static u32 calc_job_credits(u32 profile_mask)
> > +{
> > + struct panthor_job_ringbuf_instrs instrs;
> > + struct panthor_job_cs_params params = {
> > + .profile_mask = profile_mask,
> > };
> >
> > - /* Need to be cacheline aligned to please the prefetcher. */
> > - static_assert(sizeof(call_instrs) % 64 == 0,
> > - "call_instrs is not aligned on a cacheline");
> > + prepare_job_instrs(¶ms, &instrs);
> > + return instrs.count;
> > +}
> > +
> > +static struct dma_fence *
> > +queue_run_job(struct drm_sched_job *sched_job)
> > +{
> > + struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> > + struct panthor_group *group = job->group;
> > + struct panthor_queue *queue = group->queues[job->queue_idx];
> > + struct panthor_device *ptdev = group->ptdev;
> > + struct panthor_scheduler *sched = ptdev->scheduler;
> > + struct panthor_job_ringbuf_instrs instrs;
> > + struct panthor_job_cs_params cs_params;
> > + struct dma_fence *done_fence;
> > + int ret;
> >
> > /* Stream size is zero, nothing to do except making sure all previously
> > * submitted jobs are done before we signal the
> > @@ -2900,17 +3078,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> > queue->fence_ctx.id,
> > atomic64_inc_return(&queue->fence_ctx.seqno));
> >
> > - memcpy(queue->ringbuf->kmap + ringbuf_insert,
> > - call_instrs, sizeof(call_instrs));
> > + job->profiling_slot = queue->profiling_info.profiling_seqno++;
> > + if (queue->profiling_info.profiling_seqno == queue->profiling_info.slot_count)
> > + queue->profiling_info.profiling_seqno = 0;
> > +
> > + job->ringbuf.start = queue->iface.input->insert;
> > +
> > + get_job_cs_params(job, &cs_params);
> > + prepare_job_instrs(&cs_params, &instrs);
> > + copy_instrs_to_ringbuf(queue, job, &instrs);
> > +
> > + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
> >
> > panthor_job_get(&job->base);
> > spin_lock(&queue->fence_ctx.lock);
> > list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> > spin_unlock(&queue->fence_ctx.lock);
> >
> > - job->ringbuf.start = queue->iface.input->insert;
> > - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> > -
> > /* Make sure the ring buffer is updated before the INSERT
> > * register.
> > */
> > @@ -3003,6 +3187,24 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
> > .free_job = queue_free_job,
> > };
> >
> > +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
> > + u32 cs_ringbuf_size)
> > +{
> > + u32 min_profiled_job_instrs = U32_MAX;
> > + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
> > +
> > + for (u32 i = 0; i < last_flag; i++) {
> > + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
> > + min_profiled_job_instrs =
> > + min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
> > + }
>
> Okay, I think this loop deserves an explanation. The goal is to
> calculate the minimal size of a profile job so we can deduce the
> maximum number of profiling slots that will be used simultaneously. We
> ignore PANTHOR_DEVICE_PROFILING_DISABLED, because those jobs won't use
> a profiling slot in the first place.
>
> > +
> > + drm_WARN_ON(&ptdev->base,
> > + !IS_ALIGNED(min_profiled_job_instrs, NUM_INSTRS_PER_CACHE_LINE));
>
> We can probably drop this WARN_ON(), it's supposed to be checked in
> calc_job_credits().
>
> > +
> > + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
> > +}
> > +
> > static struct panthor_queue *
> > group_create_queue(struct panthor_group *group,
> > const struct drm_panthor_queue_create *args)
> > @@ -3056,9 +3258,38 @@ group_create_queue(struct panthor_group *group,
> > goto err_free_queue;
> > }
> >
> > + queue->profiling_info.slot_count =
> > + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
> > +
> > + queue->profiling_info.slots =
> > + panthor_kernel_bo_create(group->ptdev, group->vm,
> > + queue->profiling_info.slot_count *
> > + sizeof(struct panthor_job_profiling_data),
> > + DRM_PANTHOR_BO_NO_MMAP,
> > + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> > + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> > + PANTHOR_VM_KERNEL_AUTO_VA);
> > +
> > + if (IS_ERR(queue->profiling_info.slots)) {
> > + ret = PTR_ERR(queue->profiling_info.slots);
> > + goto err_free_queue;
> > + }
> > +
> > + ret = panthor_kernel_bo_vmap(queue->profiling_info.slots);
> > + if (ret)
> > + goto err_free_queue;
> > +
> > + memset(queue->profiling_info.slots->kmap, 0,
> > + queue->profiling_info.slot_count * sizeof(struct panthor_job_profiling_data));
>
> I don't think we need to memset() the profiling buffer.
>
> > +
> > + /*
> > + * Credit limit argument tells us the total number of instructions
> > + * across all CS slots in the ringbuffer, with some jobs requiring
> > + * twice as many as others, depending on their profiling status.
> > + */
> > ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> > group->ptdev->scheduler->wq, 1,
> > - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> > + args->ringbuf_size / sizeof(u64),
> > 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
> > group->ptdev->reset.wq,
> > NULL, "panthor-queue", group->ptdev->base.dev);
> > @@ -3354,6 +3585,7 @@ panthor_job_create(struct panthor_file *pfile,
> > {
> > struct panthor_group_pool *gpool = pfile->groups;
> > struct panthor_job *job;
> > + u32 credits;
> > int ret;
> >
> > if (qsubmit->pad)
> > @@ -3407,9 +3639,16 @@ panthor_job_create(struct panthor_file *pfile,
> > }
> > }
> >
> > + job->profile_mask = pfile->ptdev->profile_mask;
> > + credits = calc_job_credits(job->profile_mask);
> > + if (credits == 0) {
> > + ret = -EINVAL;
> > + goto err_put_job;
> > + }
> > +
> > ret = drm_sched_job_init(&job->base,
> > &job->group->queues[job->queue_idx]->entity,
> > - 1, job->group);
> > + credits, job->group);
> > if (ret)
> > goto err_put_job;
> >
>
> Just add a bunch of minor comments (mostly cosmetic changes), but the
> implementation looks good to me.
>
> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Adrian Larumbe
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-12 15:03 ` Adrián Larumbe
@ 2024-09-12 15:10 ` Boris Brezillon
0 siblings, 0 replies; 15+ messages in thread
From: Boris Brezillon @ 2024-09-12 15:10 UTC (permalink / raw)
To: Adrián Larumbe
Cc: Steven Price, Liviu Dudau, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, David Airlie, Daniel Vetter, Sumit Semwal,
Christian König, kernel, dri-devel, linux-kernel,
linux-media, linaro-mm-sig
On Thu, 12 Sep 2024 16:03:37 +0100
Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
> Hi Boris, thanks for the remarks,
>
> On 04.09.2024 09:49, Boris Brezillon wrote:
> > On Tue, 3 Sep 2024 21:25:35 +0100
> > Adrián Larumbe <adrian.larumbe@collabora.com> wrote:
> >
> > > Enable calculations of job submission times in clock cycles and wall
> > > time. This is done by expanding the boilerplate command stream when running
> > > a job to include instructions that compute said times right before an after
> > > a user CS.
> > >
> > > A separate kernel BO is created per queue to store those values. Jobs can
> > > access their sampled data through a slots buffer-specific index different
> > > from that of the queue's ringbuffer. The reason for this is saving memory
> > > on the profiling information kernel BO, since the amount of simultaneous
> > > profiled jobs we can write into the queue's ringbuffer might be much
> > > smaller than for regular jobs, as the former take more CSF instructions.
> > >
> > > This commit is done in preparation for enabling DRM fdinfo support in the
> > > Panthor driver, which depends on the numbers calculated herein.
> > >
> > > A profile mode mask has been added that will in a future commit allow UM to
> > > toggle performance metric sampling behaviour, which is disabled by default
> > > to save power. When a ringbuffer CS is constructed, timestamp and cycling
> > > sampling instructions are added depending on the enabled flags in the
> > > profiling mask.
> > >
> > > A helper was provided that calculates the number of instructions for a
> > > given set of enablement mask, and these are passed as the number of credits
> > > when initialising a DRM scheduler job.
> > >
> > > Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> > > ---
> > > drivers/gpu/drm/panthor/panthor_device.h | 22 ++
> > > drivers/gpu/drm/panthor/panthor_sched.c | 327 ++++++++++++++++++++---
> > > 2 files changed, 305 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > > index e388c0472ba7..a48e30d0af30 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_device.h
> > > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > > @@ -66,6 +66,25 @@ struct panthor_irq {
> > > atomic_t suspended;
> > > };
> > >
> > > +/**
> > > + * enum panthor_device_profiling_mode - Profiling state
> > > + */
> > > +enum panthor_device_profiling_flags {
> > > + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> > > + PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> > > +
> > > + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> > > + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> > > +
> > > + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> > > + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> > > +
> > > + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> > > + PANTHOR_DEVICE_PROFILING_ALL =
> > > + PANTHOR_DEVICE_PROFILING_CYCLES |
> > > + PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > +};
> > > +
> > > /**
> > > * struct panthor_device - Panthor device
> > > */
> > > @@ -162,6 +181,9 @@ struct panthor_device {
> > > */
> > > struct page *dummy_latest_flush;
> > > } pm;
> > > +
> > > + /** @profile_mask: User-set profiling flags for job accounting. */
> > > + u32 profile_mask;
> > > };
> > >
> > > /**
> > > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > > index c426a392b081..b087648bf59a 100644
> > > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > > @@ -93,6 +93,9 @@
> > > #define MIN_CSGS 3
> > > #define MAX_CSG_PRIO 0xf
> > >
> > > +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
> > > +#define MAX_INSTRS_PER_JOB 32
> > > +
> > > struct panthor_group;
> > >
> > > /**
> > > @@ -476,6 +479,18 @@ struct panthor_queue {
> > > */
> > > struct list_head in_flight_jobs;
> > > } fence_ctx;
> > > +
> > > + /** @profiling_info: Job profiling data slots and access information. */
> > > + struct {
> > > + /** @slots: Kernel BO holding the slots. */
> > > + struct panthor_kernel_bo *slots;
> > > +
> > > + /** @slot_count: Number of jobs ringbuffer can hold at once. */
> > > + u32 slot_count;
> > > +
> > > + /** @profiling_seqno: Index of the next available profiling information slot. */
> > > + u32 profiling_seqno;
> >
> > Nit: no need to repeat profiling as it's under the profiling_info
> > struct. I would simply name that one 'seqno'.
> >
> > > + } profiling_info;
> >
> > s/profiling_info/profiling/ ?
> >
> > > };
> > >
> > > /**
> > > @@ -661,6 +676,18 @@ struct panthor_group {
> > > struct list_head wait_node;
> > > };
> > >
> > > +struct panthor_job_profiling_data {
> > > + struct {
> > > + u64 before;
> > > + u64 after;
> > > + } cycles;
> > > +
> > > + struct {
> > > + u64 before;
> > > + u64 after;
> > > + } time;
> > > +};
> > > +
> > > /**
> > > * group_queue_work() - Queue a group work
> > > * @group: Group to queue the work for.
> > > @@ -774,6 +801,12 @@ struct panthor_job {
> > >
> > > /** @done_fence: Fence signaled when the job is finished or cancelled. */
> > > struct dma_fence *done_fence;
> > > +
> > > + /** @profile_mask: Current device job profiling enablement bitmask. */
> > > + u32 profile_mask;
> > > +
> > > + /** @profile_slot: Job profiling information index in the profiling slots BO. */
> > > + u32 profiling_slot;
> >
> > Nit: we tend to group fields together under sub-structs, so I'd say:
> >
> > struct {
> > u32 mask; // or flags
> > u32 slot;
> > } profiling;
> >
> > > };
> > >
> > > static void
> > > @@ -838,6 +871,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
> > >
> > > panthor_kernel_bo_destroy(queue->ringbuf);
> > > panthor_kernel_bo_destroy(queue->iface.mem);
> > > + panthor_kernel_bo_destroy(queue->profiling_info.slots);
> > >
> > > /* Release the last_fence we were holding, if any. */
> > > dma_fence_put(queue->fence_ctx.last_fence);
> > > @@ -1982,8 +2016,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> > > }
> > > }
> > >
> > > -#define NUM_INSTRS_PER_SLOT 16
> > > -
> > > static void
> > > group_term_post_processing(struct panthor_group *group)
> > > {
> > > @@ -2815,65 +2847,211 @@ static void group_sync_upd_work(struct work_struct *work)
> > > group_put(group);
> > > }
> > >
> > > -static struct dma_fence *
> > > -queue_run_job(struct drm_sched_job *sched_job)
> > > +struct panthor_job_ringbuf_instrs {
> > > + u64 buffer[MAX_INSTRS_PER_JOB];
> > > + u32 count;
> > > +};
> > > +
> > > +struct panthor_job_instr {
> > > + u32 profile_mask;
> > > + u64 instr;
> > > +};
> > > +
> > > +#define JOB_INSTR(__prof, __instr) \
> > > + { \
> > > + .profile_mask = __prof, \
> > > + .instr = __instr, \
> > > + }
> > > +
> > > +static void
> > > +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> > > + struct panthor_job *job,
> > > + struct panthor_job_ringbuf_instrs *instrs)
> > > +{
> > > + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > > + u32 start = job->ringbuf.start & (ringbuf_size - 1);
> > > + ssize_t size, written;
> > > +
> > > + /*
> > > + * We need to write a whole slot, including any trailing zeroes
> > > + * that may come at the end of it. Also, because instrs.buffer had
> > > + * been zero-initialised, there's no need to pad it with 0's
> > > + */
> > > + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> > > + size = instrs->count * sizeof(u64);
> > > + written = min(ringbuf_size - start, size);
> > > +
> > > + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> > > +
> > > + if (written < size)
> > > + memcpy(queue->ringbuf->kmap,
> > > + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)],
> > > + size - written);
> > > +}
> > > +
> > > +struct panthor_job_cs_params {
> > > + u32 profile_mask;
> > > + u64 addr_reg; u64 val_reg;
> > > + u64 cycle_reg; u64 time_reg;
> > > + u64 sync_addr; u64 times_addr;
> > > + u64 cs_start; u64 cs_size;
> > > + u32 last_flush; u32 waitall_mask;
> > > +};
> > > +
> > > +static void
> > > +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> > > {
> > > - struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> > > struct panthor_group *group = job->group;
> > > struct panthor_queue *queue = group->queues[job->queue_idx];
> > > struct panthor_device *ptdev = group->ptdev;
> > > struct panthor_scheduler *sched = ptdev->scheduler;
> > > - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> > > - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> > > - u64 addr_reg = ptdev->csif_info.cs_reg_count -
> > > - ptdev->csif_info.unpreserved_cs_reg_count;
> > > - u64 val_reg = addr_reg + 2;
> > > - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > > - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > > - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > > - struct dma_fence *done_fence;
> > > - int ret;
> > >
> > > - u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> > > + params->addr_reg = ptdev->csif_info.cs_reg_count -
> > > + ptdev->csif_info.unpreserved_cs_reg_count;
> > > + params->val_reg = params->addr_reg + 2;
> > > + params->cycle_reg = params->addr_reg;
> > > + params->time_reg = params->val_reg;
> > > +
> > > + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> > > + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> > > + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling_info.slots) +
> > > + (job->profiling_slot * sizeof(struct panthor_job_profiling_data));
> > > + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> > > +
> > > + params->cs_start = job->call_info.start;
> > > + params->cs_size = job->call_info.size;
> > > + params->last_flush = job->call_info.latest_flush;
> > > +
> > > + params->profile_mask = job->profile_mask;
> > > +}
> > > +
> > > +static void
> > > +prepare_job_instrs(const struct panthor_job_cs_params *params,
> > > + struct panthor_job_ringbuf_instrs *instrs)
> > > +{
> > > + const struct panthor_job_instr instr_seq[] = {
> > > /* MOV32 rX+2, cs.latest_flush */
> > > - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (2ull << 56) | (params->val_reg << 48) | params->last_flush),
> > >
> > > /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> > > - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> > > +
> > > + /* MOV48 rX:rX+1, cycles_offset */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > + (1ull << 56) | (params->cycle_reg << 48) |
> > > + (params->times_addr +
> > > + offsetof(struct panthor_job_profiling_data, cycles.before))),
> > > + /* STORE_STATE cycles */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > > + /* MOV48 rX:rX+1, time_offset */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > + (1ull << 56) | (params->time_reg << 48) |
> > > + (params->times_addr +
> > > + offsetof(struct panthor_job_profiling_data, time.before))),
> > > + /* STORE_STATE timer */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> > >
> > > /* MOV48 rX:rX+1, cs.start */
> > > - (1ull << 56) | (addr_reg << 48) | job->call_info.start,
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> > > /* MOV32 rX+2, cs.size */
> > > - (2ull << 56) | (val_reg << 48) | job->call_info.size,
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (2ull << 56) | (params->val_reg << 48) | params->cs_size),
> > > /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> > > - (3ull << 56) | (1 << 16),
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)),
> > > /* CALL rX:rX+1, rX+2 */
> > > - (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> > > +
> > > + /* MOV48 rX:rX+1, cycles_offset */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > + (1ull << 56) | (params->cycle_reg << 48) |
> > > + (params->times_addr +
> > > + offsetof(struct panthor_job_profiling_data, cycles.after))),
> > > + /* STORE_STATE cycles */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> > > + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> > > +
> > > + /* MOV48 rX:rX+1, time_offset */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > + (1ull << 56) | (params->time_reg << 48) |
> > > + (params->times_addr +
> > > + offsetof(struct panthor_job_profiling_data, time.after))),
> > > + /* STORE_STATE timer */
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> > > + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
> > >
> > > /* MOV48 rX:rX+1, sync_addr */
> > > - (1ull << 56) | (addr_reg << 48) | sync_addr,
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> > > /* MOV48 rX+2, #1 */
> > > - (1ull << 56) | (val_reg << 48) | 1,
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (1ull << 56) | (params->val_reg << 48) | 1),
> > > /* WAIT(all) */
> > > - (3ull << 56) | (waitall_mask << 16),
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (3ull << 56) | (params->waitall_mask << 16)),
> > > /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> > > - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> > > -
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> > > + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> > > + (params->val_reg << 32) | (0 << 16) | 1),
> > > /* ERROR_BARRIER, so we can recover from faults at job
> > > * boundaries.
> > > */
> > > - (47ull << 56),
> > > + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)),
> > > + };
> > > + u32 pad;
> > > +
> > > + /* NEED to be cacheline aligned to please the prefetcher. */
> > > + static_assert(sizeof(instrs->buffer) % 64 == 0,
> > > + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> > > +
> > > + /* Make sure we have enough storage to store the whole sequence. */
> > > + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) <=
> > > + ARRAY_SIZE(instrs->buffer),
> > > + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
> >
> > We probably want to catch situations where instrs->buffer has gone
> > bigger than needed (say we found a way to drop instructions), so I
> > would turn the '<=' condition into an '=='.
>
> I did this but it's triggering the static assert, because the instr_seq array has 19 elements,
> which when aligned to NUM_INSTRS_PER_CACHE_LINE (8 I think) renders 24, still less than the
> 32 slots in instrs->buffer. Having a slightly oversized instrs.buffer array isn't a problem
> though because instrs.count is being used when copying them into the ringbuffer, so I think
> leaving this assert as an <= should be alright.
The whole point is to have a buffer that's the right size, rather than
bigger than needed. So I'd suggest changing the MAX_INSTRS definition
to make it 24 instead.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-03 20:25 ` [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting Adrián Larumbe
2024-09-04 7:49 ` Boris Brezillon
@ 2024-09-04 16:10 ` kernel test robot
2024-09-04 17:02 ` kernel test robot
2024-09-04 18:55 ` Liviu Dudau
3 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2024-09-04 16:10 UTC (permalink / raw)
To: Adrián Larumbe, Boris Brezillon, Steven Price, Liviu Dudau,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Daniel Vetter, Sumit Semwal, Christian König
Cc: oe-kbuild-all, kernel, Adrián Larumbe, dri-devel,
linux-kernel, linux-media, linaro-mm-sig
Hi Adrián,
kernel test robot noticed the following build warnings:
[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on linus/master v6.11-rc6 next-20240904]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Adri-n-Larumbe/drm-panthor-introduce-job-cycle-and-timestamp-accounting/20240904-042645
base: git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link: https://lore.kernel.org/r/20240903202541.430225-2-adrian.larumbe%40collabora.com
patch subject: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
config: x86_64-buildonly-randconfig-002-20240904 (https://download.01.org/0day-ci/archive/20240904/202409042317.CRCMb6bs-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240904/202409042317.CRCMb6bs-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409042317.CRCMb6bs-lkp@intel.com/
All warnings (new ones prefixed by >>):
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'runnable' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'idle' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'waiting' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'has_ref' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'in_progress' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:322: warning: Excess struct member 'stopped_groups' description in 'panthor_scheduler'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'mem' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'input' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'output' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'input_fw_va' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'output_fw_va' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'gpu_va' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'ref' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'gt' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'sync64' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'bo' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'offset' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'kmap' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'lock' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'id' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'seqno' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'last_fence' description in 'panthor_queue'
drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'in_flight_jobs' description in 'panthor_queue'
>> drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'slots' description in 'panthor_queue'
>> drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'slot_count' description in 'panthor_queue'
>> drivers/gpu/drm/panthor/panthor_sched.c:494: warning: Excess struct member 'profiling_seqno' description in 'panthor_queue'
>> drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Function parameter or struct member 'profiling_slot' not described in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'start' description in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'size' description in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'latest_flush' description in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'start' description in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'end' description in 'panthor_job'
>> drivers/gpu/drm/panthor/panthor_sched.c:810: warning: Excess struct member 'profile_slot' description in 'panthor_job'
drivers/gpu/drm/panthor/panthor_sched.c:1731: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_fw_events'
drivers/gpu/drm/panthor/panthor_sched.c:1731: warning: Function parameter or struct member 'events' not described in 'panthor_sched_report_fw_events'
drivers/gpu/drm/panthor/panthor_sched.c:2623: warning: Function parameter or struct member 'ptdev' not described in 'panthor_sched_report_mmu_fault'
vim +494 drivers/gpu/drm/panthor/panthor_sched.c
de85488138247d Boris Brezillon 2024-02-29 397
de85488138247d Boris Brezillon 2024-02-29 398 /** @ringbuf: Command stream ring-buffer. */
de85488138247d Boris Brezillon 2024-02-29 399 struct panthor_kernel_bo *ringbuf;
de85488138247d Boris Brezillon 2024-02-29 400
de85488138247d Boris Brezillon 2024-02-29 401 /** @iface: Firmware interface. */
de85488138247d Boris Brezillon 2024-02-29 402 struct {
de85488138247d Boris Brezillon 2024-02-29 403 /** @mem: FW memory allocated for this interface. */
de85488138247d Boris Brezillon 2024-02-29 404 struct panthor_kernel_bo *mem;
de85488138247d Boris Brezillon 2024-02-29 405
de85488138247d Boris Brezillon 2024-02-29 406 /** @input: Input interface. */
de85488138247d Boris Brezillon 2024-02-29 407 struct panthor_fw_ringbuf_input_iface *input;
de85488138247d Boris Brezillon 2024-02-29 408
de85488138247d Boris Brezillon 2024-02-29 409 /** @output: Output interface. */
de85488138247d Boris Brezillon 2024-02-29 410 const struct panthor_fw_ringbuf_output_iface *output;
de85488138247d Boris Brezillon 2024-02-29 411
de85488138247d Boris Brezillon 2024-02-29 412 /** @input_fw_va: FW virtual address of the input interface buffer. */
de85488138247d Boris Brezillon 2024-02-29 413 u32 input_fw_va;
de85488138247d Boris Brezillon 2024-02-29 414
de85488138247d Boris Brezillon 2024-02-29 415 /** @output_fw_va: FW virtual address of the output interface buffer. */
de85488138247d Boris Brezillon 2024-02-29 416 u32 output_fw_va;
de85488138247d Boris Brezillon 2024-02-29 417 } iface;
de85488138247d Boris Brezillon 2024-02-29 418
de85488138247d Boris Brezillon 2024-02-29 419 /**
de85488138247d Boris Brezillon 2024-02-29 420 * @syncwait: Stores information about the synchronization object this
de85488138247d Boris Brezillon 2024-02-29 421 * queue is waiting on.
de85488138247d Boris Brezillon 2024-02-29 422 */
de85488138247d Boris Brezillon 2024-02-29 423 struct {
de85488138247d Boris Brezillon 2024-02-29 424 /** @gpu_va: GPU address of the synchronization object. */
de85488138247d Boris Brezillon 2024-02-29 425 u64 gpu_va;
de85488138247d Boris Brezillon 2024-02-29 426
de85488138247d Boris Brezillon 2024-02-29 427 /** @ref: Reference value to compare against. */
de85488138247d Boris Brezillon 2024-02-29 428 u64 ref;
de85488138247d Boris Brezillon 2024-02-29 429
de85488138247d Boris Brezillon 2024-02-29 430 /** @gt: True if this is a greater-than test. */
de85488138247d Boris Brezillon 2024-02-29 431 bool gt;
de85488138247d Boris Brezillon 2024-02-29 432
de85488138247d Boris Brezillon 2024-02-29 433 /** @sync64: True if this is a 64-bit sync object. */
de85488138247d Boris Brezillon 2024-02-29 434 bool sync64;
de85488138247d Boris Brezillon 2024-02-29 435
de85488138247d Boris Brezillon 2024-02-29 436 /** @bo: Buffer object holding the synchronization object. */
de85488138247d Boris Brezillon 2024-02-29 437 struct drm_gem_object *obj;
de85488138247d Boris Brezillon 2024-02-29 438
de85488138247d Boris Brezillon 2024-02-29 439 /** @offset: Offset of the synchronization object inside @bo. */
de85488138247d Boris Brezillon 2024-02-29 440 u64 offset;
de85488138247d Boris Brezillon 2024-02-29 441
de85488138247d Boris Brezillon 2024-02-29 442 /**
de85488138247d Boris Brezillon 2024-02-29 443 * @kmap: Kernel mapping of the buffer object holding the
de85488138247d Boris Brezillon 2024-02-29 444 * synchronization object.
de85488138247d Boris Brezillon 2024-02-29 445 */
de85488138247d Boris Brezillon 2024-02-29 446 void *kmap;
de85488138247d Boris Brezillon 2024-02-29 447 } syncwait;
de85488138247d Boris Brezillon 2024-02-29 448
de85488138247d Boris Brezillon 2024-02-29 449 /** @fence_ctx: Fence context fields. */
de85488138247d Boris Brezillon 2024-02-29 450 struct {
de85488138247d Boris Brezillon 2024-02-29 451 /** @lock: Used to protect access to all fences allocated by this context. */
de85488138247d Boris Brezillon 2024-02-29 452 spinlock_t lock;
de85488138247d Boris Brezillon 2024-02-29 453
de85488138247d Boris Brezillon 2024-02-29 454 /**
de85488138247d Boris Brezillon 2024-02-29 455 * @id: Fence context ID.
de85488138247d Boris Brezillon 2024-02-29 456 *
de85488138247d Boris Brezillon 2024-02-29 457 * Allocated with dma_fence_context_alloc().
de85488138247d Boris Brezillon 2024-02-29 458 */
de85488138247d Boris Brezillon 2024-02-29 459 u64 id;
de85488138247d Boris Brezillon 2024-02-29 460
de85488138247d Boris Brezillon 2024-02-29 461 /** @seqno: Sequence number of the last initialized fence. */
de85488138247d Boris Brezillon 2024-02-29 462 atomic64_t seqno;
de85488138247d Boris Brezillon 2024-02-29 463
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 464 /**
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 465 * @last_fence: Fence of the last submitted job.
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 466 *
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 467 * We return this fence when we get an empty command stream.
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 468 * This way, we are guaranteed that all earlier jobs have completed
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 469 * when drm_sched_job::s_fence::finished without having to feed
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 470 * the CS ring buffer with a dummy job that only signals the fence.
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 471 */
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 472 struct dma_fence *last_fence;
7b6f9ec6ad5112 Boris Brezillon 2024-07-03 473
de85488138247d Boris Brezillon 2024-02-29 474 /**
de85488138247d Boris Brezillon 2024-02-29 475 * @in_flight_jobs: List containing all in-flight jobs.
de85488138247d Boris Brezillon 2024-02-29 476 *
de85488138247d Boris Brezillon 2024-02-29 477 * Used to keep track and signal panthor_job::done_fence when the
de85488138247d Boris Brezillon 2024-02-29 478 * synchronization object attached to the queue is signaled.
de85488138247d Boris Brezillon 2024-02-29 479 */
de85488138247d Boris Brezillon 2024-02-29 480 struct list_head in_flight_jobs;
de85488138247d Boris Brezillon 2024-02-29 481 } fence_ctx;
6f64890b41a576 Adrián Larumbe 2024-09-03 482
6f64890b41a576 Adrián Larumbe 2024-09-03 483 /** @profiling_info: Job profiling data slots and access information. */
6f64890b41a576 Adrián Larumbe 2024-09-03 484 struct {
6f64890b41a576 Adrián Larumbe 2024-09-03 485 /** @slots: Kernel BO holding the slots. */
6f64890b41a576 Adrián Larumbe 2024-09-03 486 struct panthor_kernel_bo *slots;
6f64890b41a576 Adrián Larumbe 2024-09-03 487
6f64890b41a576 Adrián Larumbe 2024-09-03 488 /** @slot_count: Number of jobs ringbuffer can hold at once. */
6f64890b41a576 Adrián Larumbe 2024-09-03 489 u32 slot_count;
6f64890b41a576 Adrián Larumbe 2024-09-03 490
6f64890b41a576 Adrián Larumbe 2024-09-03 491 /** @profiling_seqno: Index of the next available profiling information slot. */
6f64890b41a576 Adrián Larumbe 2024-09-03 492 u32 profiling_seqno;
6f64890b41a576 Adrián Larumbe 2024-09-03 493 } profiling_info;
de85488138247d Boris Brezillon 2024-02-29 @494 };
de85488138247d Boris Brezillon 2024-02-29 495
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-03 20:25 ` [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting Adrián Larumbe
2024-09-04 7:49 ` Boris Brezillon
2024-09-04 16:10 ` kernel test robot
@ 2024-09-04 17:02 ` kernel test robot
2024-09-04 18:55 ` Liviu Dudau
3 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2024-09-04 17:02 UTC (permalink / raw)
To: Adrián Larumbe, Boris Brezillon, Steven Price, Liviu Dudau,
Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Daniel Vetter, Sumit Semwal, Christian König
Cc: oe-kbuild-all, kernel, Adrián Larumbe, dri-devel,
linux-kernel, linux-media, linaro-mm-sig
Hi Adrián,
kernel test robot noticed the following build errors:
[auto build test ERROR on drm-misc/drm-misc-next]
[also build test ERROR on linus/master v6.11-rc6 next-20240904]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Adri-n-Larumbe/drm-panthor-introduce-job-cycle-and-timestamp-accounting/20240904-042645
base: git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link: https://lore.kernel.org/r/20240903202541.430225-2-adrian.larumbe%40collabora.com
patch subject: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
config: arc-allmodconfig (https://download.01.org/0day-ci/archive/20240905/202409050054.oRwtzLQ4-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240905/202409050054.oRwtzLQ4-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409050054.oRwtzLQ4-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from <command-line>:
In function 'copy_instrs_to_ringbuf',
inlined from 'queue_run_job' at drivers/gpu/drm/panthor/panthor_sched.c:3089:2:
>> include/linux/compiler_types.h:510:45: error: call to '__compiletime_assert_435' declared with attribute error: min(ringbuf_size - start, size) signedness error
510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^
include/linux/compiler_types.h:491:25: note: in definition of macro '__compiletime_assert'
491 | prefix ## suffix(); \
| ^~~~~~
include/linux/compiler_types.h:510:9: note: in expansion of macro '_compiletime_assert'
510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^~~~~~~~~~~~~~~~~~~
include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
| ^~~~~~~~~~~~~~~~~~
include/linux/minmax.h:100:9: note: in expansion of macro 'BUILD_BUG_ON_MSG'
100 | BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \
| ^~~~~~~~~~~~~~~~
include/linux/minmax.h:105:9: note: in expansion of macro '__careful_cmp_once'
105 | __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
| ^~~~~~~~~~~~~~~~~~
include/linux/minmax.h:129:25: note: in expansion of macro '__careful_cmp'
129 | #define min(x, y) __careful_cmp(min, x, y)
| ^~~~~~~~~~~~~
drivers/gpu/drm/panthor/panthor_sched.c:2882:19: note: in expansion of macro 'min'
2882 | written = min(ringbuf_size - start, size);
| ^~~
vim +/__compiletime_assert_435 +510 include/linux/compiler_types.h
eb5c2d4b45e3d2 Will Deacon 2020-07-21 496
eb5c2d4b45e3d2 Will Deacon 2020-07-21 497 #define _compiletime_assert(condition, msg, prefix, suffix) \
eb5c2d4b45e3d2 Will Deacon 2020-07-21 498 __compiletime_assert(condition, msg, prefix, suffix)
eb5c2d4b45e3d2 Will Deacon 2020-07-21 499
eb5c2d4b45e3d2 Will Deacon 2020-07-21 500 /**
eb5c2d4b45e3d2 Will Deacon 2020-07-21 501 * compiletime_assert - break build and emit msg if condition is false
eb5c2d4b45e3d2 Will Deacon 2020-07-21 502 * @condition: a compile-time constant condition to check
eb5c2d4b45e3d2 Will Deacon 2020-07-21 503 * @msg: a message to emit if condition is false
eb5c2d4b45e3d2 Will Deacon 2020-07-21 504 *
eb5c2d4b45e3d2 Will Deacon 2020-07-21 505 * In tradition of POSIX assert, this macro will break the build if the
eb5c2d4b45e3d2 Will Deacon 2020-07-21 506 * supplied condition is *false*, emitting the supplied error message if the
eb5c2d4b45e3d2 Will Deacon 2020-07-21 507 * compiler has support to do so.
eb5c2d4b45e3d2 Will Deacon 2020-07-21 508 */
eb5c2d4b45e3d2 Will Deacon 2020-07-21 509 #define compiletime_assert(condition, msg) \
eb5c2d4b45e3d2 Will Deacon 2020-07-21 @510 _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
eb5c2d4b45e3d2 Will Deacon 2020-07-21 511
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting
2024-09-03 20:25 ` [PATCH v5 1/4] drm/panthor: introduce job cycle and timestamp accounting Adrián Larumbe
` (2 preceding siblings ...)
2024-09-04 17:02 ` kernel test robot
@ 2024-09-04 18:55 ` Liviu Dudau
3 siblings, 0 replies; 15+ messages in thread
From: Liviu Dudau @ 2024-09-04 18:55 UTC (permalink / raw)
To: Adrián Larumbe
Cc: Boris Brezillon, Steven Price, Maarten Lankhorst, Maxime Ripard,
Thomas Zimmermann, David Airlie, Daniel Vetter, Sumit Semwal,
Christian König, kernel, dri-devel, linux-kernel,
linux-media, linaro-mm-sig
On Tue, Sep 03, 2024 at 09:25:35PM +0100, Adrián Larumbe wrote:
> Enable calculations of job submission times in clock cycles and wall
> time. This is done by expanding the boilerplate command stream when running
> a job to include instructions that compute said times right before an after
s/an/and/
> a user CS.
>
> A separate kernel BO is created per queue to store those values. Jobs can
> access their sampled data through a slots buffer-specific index different
s/slots/slot's/ ?
> from that of the queue's ringbuffer. The reason for this is saving memory
> on the profiling information kernel BO, since the amount of simultaneous
> profiled jobs we can write into the queue's ringbuffer might be much
> smaller than for regular jobs, as the former take more CSF instructions.
>
> This commit is done in preparation for enabling DRM fdinfo support in the
> Panthor driver, which depends on the numbers calculated herein.
>
> A profile mode mask has been added that will in a future commit allow UM to
> toggle performance metric sampling behaviour, which is disabled by default
> to save power. When a ringbuffer CS is constructed, timestamp and cycling
> sampling instructions are added depending on the enabled flags in the
> profiling mask.
>
> A helper was provided that calculates the number of instructions for a
> given set of enablement mask, and these are passed as the number of credits
> when initialising a DRM scheduler job.
>
> Signed-off-by: Adrián Larumbe <adrian.larumbe@collabora.com>
> ---
> drivers/gpu/drm/panthor/panthor_device.h | 22 ++
> drivers/gpu/drm/panthor/panthor_sched.c | 327 ++++++++++++++++++++---
> 2 files changed, 305 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> index e388c0472ba7..a48e30d0af30 100644
> --- a/drivers/gpu/drm/panthor/panthor_device.h
> +++ b/drivers/gpu/drm/panthor/panthor_device.h
> @@ -66,6 +66,25 @@ struct panthor_irq {
> atomic_t suspended;
> };
>
> +/**
> + * enum panthor_device_profiling_mode - Profiling state
> + */
> +enum panthor_device_profiling_flags {
> + /** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
> + PANTHOR_DEVICE_PROFILING_DISABLED = 0,
> +
> + /** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
> + PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
> +
> + /** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
> +
> + /** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
> + PANTHOR_DEVICE_PROFILING_ALL =
> + PANTHOR_DEVICE_PROFILING_CYCLES |
> + PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> +};
> +
> /**
> * struct panthor_device - Panthor device
> */
> @@ -162,6 +181,9 @@ struct panthor_device {
> */
> struct page *dummy_latest_flush;
> } pm;
> +
> + /** @profile_mask: User-set profiling flags for job accounting. */
> + u32 profile_mask;
> };
>
> /**
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index c426a392b081..b087648bf59a 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -93,6 +93,9 @@
> #define MIN_CSGS 3
> #define MAX_CSG_PRIO 0xf
>
> +#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
> +#define MAX_INSTRS_PER_JOB 32
> +
> struct panthor_group;
>
> /**
> @@ -476,6 +479,18 @@ struct panthor_queue {
> */
> struct list_head in_flight_jobs;
> } fence_ctx;
> +
> + /** @profiling_info: Job profiling data slots and access information. */
> + struct {
> + /** @slots: Kernel BO holding the slots. */
> + struct panthor_kernel_bo *slots;
> +
> + /** @slot_count: Number of jobs ringbuffer can hold at once. */
> + u32 slot_count;
> +
> + /** @profiling_seqno: Index of the next available profiling information slot. */
> + u32 profiling_seqno;
> + } profiling_info;
> };
>
> /**
> @@ -661,6 +676,18 @@ struct panthor_group {
> struct list_head wait_node;
> };
>
> +struct panthor_job_profiling_data {
> + struct {
> + u64 before;
> + u64 after;
> + } cycles;
> +
> + struct {
> + u64 before;
> + u64 after;
> + } time;
> +};
> +
> /**
> * group_queue_work() - Queue a group work
> * @group: Group to queue the work for.
> @@ -774,6 +801,12 @@ struct panthor_job {
>
> /** @done_fence: Fence signaled when the job is finished or cancelled. */
> struct dma_fence *done_fence;
> +
> + /** @profile_mask: Current device job profiling enablement bitmask. */
> + u32 profile_mask;
> +
> + /** @profile_slot: Job profiling information index in the profiling slots BO. */
> + u32 profiling_slot;
> };
>
> static void
> @@ -838,6 +871,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>
> panthor_kernel_bo_destroy(queue->ringbuf);
> panthor_kernel_bo_destroy(queue->iface.mem);
> + panthor_kernel_bo_destroy(queue->profiling_info.slots);
>
> /* Release the last_fence we were holding, if any. */
> dma_fence_put(queue->fence_ctx.last_fence);
> @@ -1982,8 +2016,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
> }
> }
>
> -#define NUM_INSTRS_PER_SLOT 16
> -
> static void
> group_term_post_processing(struct panthor_group *group)
> {
> @@ -2815,65 +2847,211 @@ static void group_sync_upd_work(struct work_struct *work)
> group_put(group);
> }
>
> -static struct dma_fence *
> -queue_run_job(struct drm_sched_job *sched_job)
> +struct panthor_job_ringbuf_instrs {
> + u64 buffer[MAX_INSTRS_PER_JOB];
> + u32 count;
> +};
> +
> +struct panthor_job_instr {
> + u32 profile_mask;
> + u64 instr;
> +};
> +
> +#define JOB_INSTR(__prof, __instr) \
> + { \
> + .profile_mask = __prof, \
> + .instr = __instr, \
> + }
> +
> +static void
> +copy_instrs_to_ringbuf(struct panthor_queue *queue,
> + struct panthor_job *job,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + ssize_t ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> + u32 start = job->ringbuf.start & (ringbuf_size - 1);
> + ssize_t size, written;
> +
> + /*
> + * We need to write a whole slot, including any trailing zeroes
> + * that may come at the end of it. Also, because instrs.buffer had
s/had/has/
> + * been zero-initialised, there's no need to pad it with 0's
> + */
> + instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + size = instrs->count * sizeof(u64);
> + written = min(ringbuf_size - start, size);
> +
> + memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
> +
> + if (written < size)
> + memcpy(queue->ringbuf->kmap,
> + &instrs->buffer[(ringbuf_size - start)/sizeof(u64)],
> + size - written);
> +}
> +
> +struct panthor_job_cs_params {
> + u32 profile_mask;
> + u64 addr_reg; u64 val_reg;
> + u64 cycle_reg; u64 time_reg;
> + u64 sync_addr; u64 times_addr;
> + u64 cs_start; u64 cs_size;
> + u32 last_flush; u32 waitall_mask;
> +};
> +
> +static void
> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
> {
> - struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> struct panthor_group *group = job->group;
> struct panthor_queue *queue = group->queues[job->queue_idx];
> struct panthor_device *ptdev = group->ptdev;
> struct panthor_scheduler *sched = ptdev->scheduler;
> - u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
> - u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
> - u64 addr_reg = ptdev->csif_info.cs_reg_count -
> - ptdev->csif_info.unpreserved_cs_reg_count;
> - u64 val_reg = addr_reg + 2;
> - u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> - job->queue_idx * sizeof(struct panthor_syncobj_64b);
> - u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> - struct dma_fence *done_fence;
> - int ret;
>
> - u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
> + params->addr_reg = ptdev->csif_info.cs_reg_count -
> + ptdev->csif_info.unpreserved_cs_reg_count;
> + params->val_reg = params->addr_reg + 2;
> + params->cycle_reg = params->addr_reg;
> + params->time_reg = params->val_reg;
> +
> + params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
> + job->queue_idx * sizeof(struct panthor_syncobj_64b);
> + params->times_addr = panthor_kernel_bo_gpuva(queue->profiling_info.slots) +
> + (job->profiling_slot * sizeof(struct panthor_job_profiling_data));
> + params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
> +
> + params->cs_start = job->call_info.start;
> + params->cs_size = job->call_info.size;
> + params->last_flush = job->call_info.latest_flush;
> +
> + params->profile_mask = job->profile_mask;
> +}
> +
> +static void
> +prepare_job_instrs(const struct panthor_job_cs_params *params,
> + struct panthor_job_ringbuf_instrs *instrs)
> +{
> + const struct panthor_job_instr instr_seq[] = {
> /* MOV32 rX+2, cs.latest_flush */
> - (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (2ull << 56) | (params->val_reg << 48) | params->last_flush),
>
> /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
> - (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
> +
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, cycles.before))),
> + /* STORE_STATE cycles */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (1ull << 56) | (params->time_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, time.before))),
> + /* STORE_STATE timer */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>
> /* MOV48 rX:rX+1, cs.start */
> - (1ull << 56) | (addr_reg << 48) | job->call_info.start,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->addr_reg << 48) | params->cs_start),
> /* MOV32 rX+2, cs.size */
> - (2ull << 56) | (val_reg << 48) | job->call_info.size,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (2ull << 56) | (params->val_reg << 48) | params->cs_size),
> /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
> - (3ull << 56) | (1 << 16),
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (3ull << 56) | (1 << 16)),
> /* CALL rX:rX+1, rX+2 */
> - (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
> +
> + /* MOV48 rX:rX+1, cycles_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (1ull << 56) | (params->cycle_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, cycles.after))),
> + /* STORE_STATE cycles */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES,
> + (40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
> +
> + /* MOV48 rX:rX+1, time_offset */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (1ull << 56) | (params->time_reg << 48) |
> + (params->times_addr +
> + offsetof(struct panthor_job_profiling_data, time.after))),
> + /* STORE_STATE timer */
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP,
> + (40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>
> /* MOV48 rX:rX+1, sync_addr */
> - (1ull << 56) | (addr_reg << 48) | sync_addr,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
> /* MOV48 rX+2, #1 */
> - (1ull << 56) | (val_reg << 48) | 1,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (1ull << 56) | (params->val_reg << 48) | 1),
> /* WAIT(all) */
> - (3ull << 56) | (waitall_mask << 16),
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (3ull << 56) | (params->waitall_mask << 16)),
> /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
> - (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
> -
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED,
> + (51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
> + (params->val_reg << 32) | (0 << 16) | 1),
> /* ERROR_BARRIER, so we can recover from faults at job
> * boundaries.
> */
> - (47ull << 56),
> + JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (47ull << 56)),
> + };
> + u32 pad;
> +
> + /* NEED to be cacheline aligned to please the prefetcher. */
> + static_assert(sizeof(instrs->buffer) % 64 == 0,
> + "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
> +
> + /* Make sure we have enough storage to store the whole sequence. */
> + static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) <=
> + ARRAY_SIZE(instrs->buffer),
> + "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
> +
> + for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
> + /* If the profile mask of this instruction is not enabled, skip it. */
> + if (instr_seq[i].profile_mask &&
> + !(instr_seq[i].profile_mask & params->profile_mask))
> + continue;
> +
> + instrs->buffer[instrs->count++] = instr_seq[i].instr;
> + }
> +
> + pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
> + memset(&instrs->buffer[instrs->count], 0,
> + (pad - instrs->count) * sizeof(instrs->buffer[0]));
> + instrs->count = pad;
> +}
> +
> +static u32 calc_job_credits(u32 profile_mask)
> +{
> + struct panthor_job_ringbuf_instrs instrs;
> + struct panthor_job_cs_params params = {
> + .profile_mask = profile_mask,
> };
>
> - /* Need to be cacheline aligned to please the prefetcher. */
> - static_assert(sizeof(call_instrs) % 64 == 0,
> - "call_instrs is not aligned on a cacheline");
> + prepare_job_instrs(¶ms, &instrs);
> + return instrs.count;
> +}
> +
> +static struct dma_fence *
> +queue_run_job(struct drm_sched_job *sched_job)
> +{
> + struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
> + struct panthor_group *group = job->group;
> + struct panthor_queue *queue = group->queues[job->queue_idx];
> + struct panthor_device *ptdev = group->ptdev;
> + struct panthor_scheduler *sched = ptdev->scheduler;
> + struct panthor_job_ringbuf_instrs instrs;
> + struct panthor_job_cs_params cs_params;
> + struct dma_fence *done_fence;
> + int ret;
>
> /* Stream size is zero, nothing to do except making sure all previously
> * submitted jobs are done before we signal the
> @@ -2900,17 +3078,23 @@ queue_run_job(struct drm_sched_job *sched_job)
> queue->fence_ctx.id,
> atomic64_inc_return(&queue->fence_ctx.seqno));
>
> - memcpy(queue->ringbuf->kmap + ringbuf_insert,
> - call_instrs, sizeof(call_instrs));
> + job->profiling_slot = queue->profiling_info.profiling_seqno++;
> + if (queue->profiling_info.profiling_seqno == queue->profiling_info.slot_count)
> + queue->profiling_info.profiling_seqno = 0;
> +
> + job->ringbuf.start = queue->iface.input->insert;
> +
> + get_job_cs_params(job, &cs_params);
> + prepare_job_instrs(&cs_params, &instrs);
> + copy_instrs_to_ringbuf(queue, job, &instrs);
> +
> + job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>
> panthor_job_get(&job->base);
> spin_lock(&queue->fence_ctx.lock);
> list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
> spin_unlock(&queue->fence_ctx.lock);
>
> - job->ringbuf.start = queue->iface.input->insert;
> - job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
> -
> /* Make sure the ring buffer is updated before the INSERT
> * register.
> */
> @@ -3003,6 +3187,24 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
> .free_job = queue_free_job,
> };
>
> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
> + u32 cs_ringbuf_size)
> +{
> + u32 min_profiled_job_instrs = U32_MAX;
> + u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
> +
> + for (u32 i = 0; i < last_flag; i++) {
> + if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
> + min_profiled_job_instrs =
> + min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
> + }
> +
> + drm_WARN_ON(&ptdev->base,
> + !IS_ALIGNED(min_profiled_job_instrs, NUM_INSTRS_PER_CACHE_LINE));
> +
> + return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
> +}
> +
> static struct panthor_queue *
> group_create_queue(struct panthor_group *group,
> const struct drm_panthor_queue_create *args)
> @@ -3056,9 +3258,38 @@ group_create_queue(struct panthor_group *group,
> goto err_free_queue;
> }
>
> + queue->profiling_info.slot_count =
> + calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
> +
> + queue->profiling_info.slots =
> + panthor_kernel_bo_create(group->ptdev, group->vm,
> + queue->profiling_info.slot_count *
> + sizeof(struct panthor_job_profiling_data),
> + DRM_PANTHOR_BO_NO_MMAP,
> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
> + DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> + PANTHOR_VM_KERNEL_AUTO_VA);
> +
> + if (IS_ERR(queue->profiling_info.slots)) {
> + ret = PTR_ERR(queue->profiling_info.slots);
> + goto err_free_queue;
> + }
> +
> + ret = panthor_kernel_bo_vmap(queue->profiling_info.slots);
> + if (ret)
> + goto err_free_queue;
> +
> + memset(queue->profiling_info.slots->kmap, 0,
> + queue->profiling_info.slot_count * sizeof(struct panthor_job_profiling_data));
> +
> + /*
> + * Credit limit argument tells us the total number of instructions
> + * across all CS slots in the ringbuffer, with some jobs requiring
> + * twice as many as others, depending on their profiling status.
> + */
> ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
> group->ptdev->scheduler->wq, 1,
> - args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
> + args->ringbuf_size / sizeof(u64),
> 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
> group->ptdev->reset.wq,
> NULL, "panthor-queue", group->ptdev->base.dev);
> @@ -3354,6 +3585,7 @@ panthor_job_create(struct panthor_file *pfile,
> {
> struct panthor_group_pool *gpool = pfile->groups;
> struct panthor_job *job;
> + u32 credits;
> int ret;
>
> if (qsubmit->pad)
> @@ -3407,9 +3639,16 @@ panthor_job_create(struct panthor_file *pfile,
> }
> }
>
> + job->profile_mask = pfile->ptdev->profile_mask;
> + credits = calc_job_credits(job->profile_mask);
> + if (credits == 0) {
> + ret = -EINVAL;
> + goto err_put_job;
> + }
> +
> ret = drm_sched_job_init(&job->base,
> &job->group->queues[job->queue_idx]->entity,
> - 1, job->group);
> + credits, job->group);
> if (ret)
> goto err_put_job;
>
> --
> 2.46.0
>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Best regards,
Liviu
--
====================
| I would like to |
| fix the world, |
| but they're not |
| giving me the |
\ source code! /
---------------
¯\_(ツ)_/¯
^ permalink raw reply [flat|nested] 15+ messages in thread