Re: [PATCH 2/2] drm/xe: Waste fewer instructions in emit_wa_job()

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Raag Jadav <raag.jadav@intel.com>
To: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Cc: intel-xe@lists.freedesktop.org, kernel-dev@igalia.com
Subject: Re: [PATCH 2/2] drm/xe: Waste fewer instructions in emit_wa_job()
Date: Fri, 27 Jun 2025 16:20:10 +0300	[thread overview]
Message-ID: <aF6aigpUviiv0OXU@black.fi.intel.com> (raw)
In-Reply-To: <20250627131438.54398-3-tvrtko.ursulin@igalia.com>

On Fri, Jun 27, 2025 at 02:14:38PM +0100, Tvrtko Ursulin wrote:
> I was debugging some unrelated issue and noticed the current code was
> very verbose. We can improve it easily by using the more common batch
> buffer building pattern.
> 
> Before:
> 
>                         bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
>      c4d:       41 8b 56 10             mov    0x10(%r14),%edx
>      c51:       49 8b 4e 08             mov    0x8(%r14),%rcx
>      c55:       8d 72 01                lea    0x1(%rdx),%esi
>      c58:       41 89 76 10             mov    %esi,0x10(%r14)
>      c5c:       c7 04 91 01 00 08 15    movl   $0x15080001,(%rcx,%rdx,4)
>                         bb->cs[bb->len++] = entry->reg.addr;
>      c63:       8b 08                   mov    (%rax),%ecx
>      c65:       41 8b 56 10             mov    0x10(%r14),%edx
>      c69:       49 8b 76 08             mov    0x8(%r14),%rsi
>      c6d:       81 e1 ff ff 3f 00       and    $0x3fffff,%ecx
>      c73:       8d 7a 01                lea    0x1(%rdx),%edi
>      c76:       41 89 7e 10             mov    %edi,0x10(%r14)
>      c7a:       89 0c 96                mov    %ecx,(%rsi,%rdx,4)
>  ..etc..
> 
> After:
> 
>                              *cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
>      c52:       41 c7 04 24 01 00 08    movl   $0x15080001,(%r12)
>      c59:       15
>                         *cs++ = entry->reg.addr;
>      c5a:       8b 10                   mov    (%rax),%edx
>  ..etc..

Use bloat-o-meter. It usually provides nice summary.

Raag

> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
> ---
>  drivers/gpu/drm/xe/xe_gt.c  | 77 ++++++++++++++++++++-----------------
>  drivers/gpu/drm/xe/xe_lrc.c | 12 +++---
>  drivers/gpu/drm/xe/xe_lrc.h |  2 +-
>  3 files changed, 49 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 86018fee74d3..a5b0db43bcee 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -182,19 +182,21 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>  {
>  	struct xe_reg_sr *sr = &q->hwe->reg_lrc;
>  	struct xe_reg_sr_entry *entry;
> +	int count = 0, count_rmw = 0;
> +	struct xe_sched_job *job;
> +	struct dma_fence *fence;
>  	unsigned long idx;
> -	struct xe_sched_job *job;
>  	struct xe_bb *bb;
> -	struct dma_fence *fence;
>  	long timeout;
> -	int count_rmw = 0;
> -	int count = 0;
> +	u32 *cs;
>  
>  	/* Just pick a large BB size */
>  	bb = xe_bb_new(gt, xe_gt_lrc_size(gt, q->hwe->class), false);
>  	if (IS_ERR(bb))
>  		return PTR_ERR(bb);
>  
> +	cs = bb->cs;
> +
>  	/* count RMW registers as those will be handled separately */
>  	xa_for_each(&sr->xa, idx, entry) {
>  		if (entry->reg.masked || entry->clr_bits == ~0)
> @@ -209,7 +211,7 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>  	if (count) {
>  		/* emit single LRI with all non RMW regs */
>  
> -		bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
> +		*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
>  
>  		xa_for_each(&sr->xa, idx, entry) {
>  			struct xe_reg reg = entry->reg;
> @@ -224,8 +226,8 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>  
>  			val |= entry->set_bits;
>  
> -			bb->cs[bb->len++] = reg.addr;
> -			bb->cs[bb->len++] = val;
> +			*cs++ = reg.addr;
> +			*cs++ = val;
>  			xe_gt_dbg(gt, "REG[0x%x] = 0x%08x", reg.addr, val);
>  		}
>  	}
> @@ -237,46 +239,49 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
>  			if (entry->reg.masked || entry->clr_bits == ~0)
>  				continue;
>  
> -			bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
> -			bb->cs[bb->len++] = entry->reg.addr;
> -			bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> +			*cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
> +			*cs++ = entry->reg.addr;
> +			*cs++ = CS_GPR_REG(0, 0).addr;
>  
> -			bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
> -					    MI_LRI_LRM_CS_MMIO;
> -			bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
> -			bb->cs[bb->len++] = entry->clr_bits;
> -			bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
> -			bb->cs[bb->len++] = entry->set_bits;
> +			*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
> +				MI_LRI_LRM_CS_MMIO;
> +			*cs++ = CS_GPR_REG(0, 1).addr;
> +			*cs++ = entry->clr_bits;
> +			*cs++ = CS_GPR_REG(0, 2).addr;
> +			*cs++ = entry->set_bits;
>  
> -			bb->cs[bb->len++] = MI_MATH(8);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_LOADINV(SRCB, REG1);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_AND;
> -			bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCB, REG2);
> -			bb->cs[bb->len++] = CS_ALU_INSTR_OR;
> -			bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU);
> +			*cs++ = MI_MATH(8);
> +			*cs++ = CS_ALU_INSTR_LOAD(SRCA, REG0);
> +			*cs++ = CS_ALU_INSTR_LOADINV(SRCB, REG1);
> +			*cs++ = CS_ALU_INSTR_AND;
> +			*cs++ = CS_ALU_INSTR_STORE(REG0, ACCU);
> +			*cs++ = CS_ALU_INSTR_LOAD(SRCA, REG0);
> +			*cs++ = CS_ALU_INSTR_LOAD(SRCB, REG2);
> +			*cs++ = CS_ALU_INSTR_OR;
> +			*cs++ = CS_ALU_INSTR_STORE(REG0, ACCU);
>  
> -			bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO;
> -			bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> -			bb->cs[bb->len++] = entry->reg.addr;
> +			*cs++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO;
> +			*cs++ = CS_GPR_REG(0, 0).addr;
> +			*cs++ = entry->reg.addr;
>  
>  			xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n",
>  				  entry->reg.addr, entry->clr_bits, entry->set_bits);
>  		}
>  
>  		/* reset used GPR */
> -		bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) | MI_LRI_LRM_CS_MMIO;
> -		bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr;
> -		bb->cs[bb->len++] = 0;
> -		bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr;
> -		bb->cs[bb->len++] = 0;
> -		bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr;
> -		bb->cs[bb->len++] = 0;
> +		*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) |
> +			MI_LRI_LRM_CS_MMIO;
> +		*cs++ = CS_GPR_REG(0, 0).addr;
> +		*cs++ = 0;
> +		*cs++ = CS_GPR_REG(0, 1).addr;
> +		*cs++ = 0;
> +		*cs++ = CS_GPR_REG(0, 2).addr;
> +		*cs++ = 0;
>  	}
>  
> -	xe_lrc_emit_hwe_state_instructions(q, bb);
> +	cs = xe_lrc_emit_hwe_state_instructions(q, cs);
> +
> +	bb->len = cs - bb->cs;
>  
>  	job = xe_bb_create_job(q, bb);
>  	if (IS_ERR(job)) {
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 37598588a54f..8780ea1340d0 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -1775,7 +1775,7 @@ static const struct instr_state xe_hpg_svg_state[] = {
>  	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
>  };
>  
> -void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
> +u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
>  {
>  	struct xe_gt *gt = q->hwe->gt;
>  	struct xe_device *xe = gt_to_xe(gt);
> @@ -1810,7 +1810,7 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
>  	if (!state_table) {
>  		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
>  			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
> -		return;
> +		return cs;
>  	}
>  
>  	for (int i = 0; i < state_table_size; i++) {
> @@ -1833,12 +1833,14 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
>  		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
>  			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
>  
> -		bb->cs[bb->len] = instr;
> +		*cs = instr;
>  		if (!is_single_dw)
> -			bb->cs[bb->len] |= (num_dw - 2);
> +			*cs |= (num_dw - 2);
>  
> -		bb->len += num_dw;
> +		cs += num_dw;
>  	}
> +
> +	return cs;
>  }
>  
>  struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index eb6e8de8c939..b6c8053c581b 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -112,7 +112,7 @@ void xe_lrc_dump_default(struct drm_printer *p,
>  			 struct xe_gt *gt,
>  			 enum xe_engine_class);
>  
> -void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb);
> +u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs);
>  
>  struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc);
>  void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot);
> -- 
> 2.48.0
>

next prev parent reply	other threads:[~2025-06-27 13:20 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-06-27 13:14 [PATCH 0/2] Small emit_wa_job() tidy Tvrtko Ursulin
2025-06-27 13:14 ` [PATCH 1/2] drm/xe: Simplify batch buffer allocation in emit_wa_job() Tvrtko Ursulin
2025-06-27 22:47   ` Lucas De Marchi
2025-06-30  7:44     ` Tvrtko Ursulin
2025-06-27 13:14 ` [PATCH 2/2] drm/xe: Waste fewer instructions " Tvrtko Ursulin
2025-06-27 13:20   ` Raag Jadav [this message]
2025-06-30 13:39     ` Tvrtko Ursulin
2025-06-30 14:12 ` ✗ CI.checkpatch: warning for Small emit_wa_job() tidy Patchwork
2025-06-30 14:13 ` ✓ CI.KUnit: success " Patchwork
2025-06-30 14:48 ` ✓ Xe.CI.BAT: " Patchwork
2025-07-01 15:36 ` ✓ Xe.CI.Full: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aF6aigpUviiv0OXU@black.fi.intel.com \
    --to=raag.jadav@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=kernel-dev@igalia.com \
    --cc=tvrtko.ursulin@igalia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox