From: Matthew Brost <matthew.brost@intel.com>
To: Matthew Auld <matthew.auld@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH v3 6/7] drm/xe/migrate: support MEM_COPY instruction
Date: Wed, 22 Oct 2025 10:53:42 -0700 [thread overview]
Message-ID: <aPkaJnsZmaIXF6rO@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <20251022163836.191405-7-matthew.auld@intel.com>
On Wed, Oct 22, 2025 at 05:38:35PM +0100, Matthew Auld wrote:
> Make this the default on xe2+ when doing a copy. This has a few
> advantages over the existing copy instruction:
>
> 1) It has a special PAGE_COPY mode that claims to be optimised for
> page-in/page-out, which is the vast majority of current users.
>
> 2) It also has a simple BYTE_COPY mode that supports byte granularity
> copying without any restrictions.
>
> With 2) we can now easily skip the bounce buffer flow when copying
> buffers with strange sizing/alignment, like for memory_access. But that
> is left for the next patch.
>
> v2 (Matt Brost):
> - Use device info to check whether device should use the MEM_COPY
> path. This should fit better with making this a configfs tunable.
> - And with that also keep old path still functional on xe2 for possible
> experimentation.
> - Add a define for PAGE_COPY page-size.
> v3 (Matt Brost):
> - Fall back to an actual linear copy for pitch=1.
> - Also update NVL.
>
> BSpec: 57561
> Signed-off-by: Matthew Auld <matthew.auld@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
> ---
> .../gpu/drm/xe/instructions/xe_gpu_commands.h | 6 ++
> drivers/gpu/drm/xe/xe_device_types.h | 2 +
> drivers/gpu/drm/xe/xe_migrate.c | 61 ++++++++++++++++++-
> drivers/gpu/drm/xe/xe_pci.c | 5 ++
> drivers/gpu/drm/xe/xe_pci_types.h | 1 +
> 5 files changed, 72 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h b/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
> index 8cfcd3360896..5d41ca297447 100644
> --- a/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
> +++ b/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
> @@ -31,6 +31,12 @@
> #define XY_FAST_COPY_BLT_D1_DST_TILE4 REG_BIT(30)
> #define XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK GENMASK(23, 20)
>
> +#define MEM_COPY_CMD (2 << 29 | 0x5a << 22 | 0x8)
> +#define MEM_COPY_PAGE_COPY_MODE REG_BIT(19)
> +#define MEM_COPY_MATRIX_COPY REG_BIT(17)
> +#define MEM_COPY_SRC_MOCS_INDEX_MASK GENMASK(31, 28)
> +#define MEM_COPY_DST_MOCS_INDEX_MASK GENMASK(6, 3)
> +
> #define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22)
> #define PVC_MEM_SET_CMD_LEN_DW 7
> #define PVC_MEM_SET_MATRIX REG_BIT(17)
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 02c04ad7296e..6a62b520f5b5 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -305,6 +305,8 @@ struct xe_device {
> * pcode mailbox commands.
> */
> u8 has_mbx_power_limits:1;
> + /** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
> + u8 has_mem_copy_instr:1;
> /** @info.has_pxp: Device has PXP support */
> u8 has_pxp:1;
> /** @info.has_range_tlb_inval: Has range based TLB invalidations */
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 95aefe2e71f5..1bbc7bca33ed 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -699,9 +699,9 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
> }
>
> #define EMIT_COPY_DW 10
> -static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> - u64 src_ofs, u64 dst_ofs, unsigned int size,
> - unsigned int pitch)
> +static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
> + u64 dst_ofs, unsigned int size,
> + unsigned int pitch)
> {
> struct xe_device *xe = gt_to_xe(gt);
> u32 mocs = 0;
> @@ -730,6 +730,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> bb->cs[bb->len++] = upper_32_bits(src_ofs);
> }
>
> +#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
> +static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
> + u64 dst_ofs, unsigned int size, unsigned int pitch)
> +{
> + u32 mode, copy_type, width;
> +
> + xe_gt_assert(gt, IS_ALIGNED(size, pitch));
> + xe_gt_assert(gt, pitch <= U16_MAX);
> + xe_gt_assert(gt, pitch);
> + xe_gt_assert(gt, size);
> +
> + if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) &&
> + IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) &&
> + IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) {
> + mode = MEM_COPY_PAGE_COPY_MODE;
> + copy_type = 0; /* linear copy */
> + width = size / PAGE_COPY_MODE_PS;
> + } else if (pitch > 1) {
> + xe_gt_assert(gt, size / pitch <= U16_MAX);
> + mode = 0; /* BYTE_COPY */
> + copy_type = MEM_COPY_MATRIX_COPY;
> + width = pitch;
> + } else {
> + mode = 0; /* BYTE_COPY */
> + copy_type = 0; /* linear copy */
> + width = size;
> + }
> +
> + xe_gt_assert(gt, width <= U16_MAX);
> +
> + bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type;
> + bb->cs[bb->len++] = width - 1;
> + bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
> + bb->cs[bb->len++] = pitch - 1;
> + bb->cs[bb->len++] = pitch - 1;
> + bb->cs[bb->len++] = lower_32_bits(src_ofs);
> + bb->cs[bb->len++] = upper_32_bits(src_ofs);
> + bb->cs[bb->len++] = lower_32_bits(dst_ofs);
> + bb->cs[bb->len++] = upper_32_bits(dst_ofs);
> + bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
> + FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);
> +}
> +
> +static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
> + u64 src_ofs, u64 dst_ofs, unsigned int size,
> + unsigned int pitch)
> +{
> + struct xe_device *xe = gt_to_xe(gt);
> +
> + if (xe->info.has_mem_copy_instr)
> + emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
> + else
> + emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
> +}
> +
> static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
> {
> return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index c3136141a953..c9f07fac48b9 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -342,6 +342,7 @@ static const struct xe_device_desc lnl_desc = {
> .has_display = true,
> .has_flat_ccs = 1,
> .has_pxp = true,
> + .has_mem_copy_instr = true,
> .max_gt_per_tile = 2,
> .needs_scratch = true,
> .va_bits = 48,
> @@ -362,6 +363,7 @@ static const struct xe_device_desc bmg_desc = {
> .has_heci_cscfi = 1,
> .has_late_bind = true,
> .has_sriov = true,
> + .has_mem_copy_instr = true,
> .max_gt_per_tile = 2,
> .needs_scratch = true,
> .subplatforms = (const struct xe_subplatform_desc[]) {
> @@ -378,6 +380,7 @@ static const struct xe_device_desc ptl_desc = {
> .has_display = true,
> .has_flat_ccs = 1,
> .has_sriov = true,
> + .has_mem_copy_instr = true,
> .max_gt_per_tile = 2,
> .needs_scratch = true,
> .needs_shared_vf_gt_wq = true,
> @@ -390,6 +393,7 @@ static const struct xe_device_desc nvls_desc = {
> .dma_mask_size = 46,
> .has_display = true,
> .has_flat_ccs = 1,
> + .has_mem_copy_instr = true,
> .max_gt_per_tile = 2,
> .require_force_probe = true,
> .va_bits = 48,
> @@ -657,6 +661,7 @@ static int xe_info_init_early(struct xe_device *xe,
> xe->info.has_pxp = desc->has_pxp;
> xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) &&
> desc->has_sriov;
> + xe->info.has_mem_copy_instr = desc->has_mem_copy_instr;
> xe->info.skip_guc_pc = desc->skip_guc_pc;
> xe->info.skip_mtcfg = desc->skip_mtcfg;
> xe->info.skip_pcode = desc->skip_pcode;
> diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
> index a4451bdc79fb..9892c063a9c5 100644
> --- a/drivers/gpu/drm/xe/xe_pci_types.h
> +++ b/drivers/gpu/drm/xe/xe_pci_types.h
> @@ -46,6 +46,7 @@ struct xe_device_desc {
> u8 has_late_bind:1;
> u8 has_llc:1;
> u8 has_mbx_power_limits:1;
> + u8 has_mem_copy_instr:1;
> u8 has_pxp:1;
> u8 has_sriov:1;
> u8 needs_scratch:1;
> --
> 2.51.0
>
next prev parent reply other threads:[~2025-10-22 17:53 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-22 16:38 [PATCH v3 0/7] Some migration fixes/improvements Matthew Auld
2025-10-22 16:38 ` [PATCH v3 1/7] drm/xe/migrate: fix offset and len check Matthew Auld
2025-10-22 17:33 ` Matthew Brost
2025-10-22 16:38 ` [PATCH v3 2/7] drm/xe/migrate: rework size restrictions for sram pte emit Matthew Auld
2025-10-22 16:38 ` [PATCH v3 3/7] drm/xe/migrate: fix chunk handling for 2M page emit Matthew Auld
2025-10-22 16:38 ` [PATCH v3 4/7] drm/xe/migrate: fix batch buffer sizing Matthew Auld
2025-10-22 16:38 ` [PATCH v3 5/7] drm/xe/migrate: trim " Matthew Auld
2025-10-22 16:38 ` [PATCH v3 6/7] drm/xe/migrate: support MEM_COPY instruction Matthew Auld
2025-10-22 17:53 ` Matthew Brost [this message]
2025-10-22 16:38 ` [PATCH v3 7/7] drm/xe/migrate: skip bounce buffer path on xe2 Matthew Auld
2025-10-22 17:59 ` Matthew Brost
2025-10-23 0:16 ` ✓ CI.KUnit: success for Some migration fixes/improvements (rev3) Patchwork
2025-10-23 0:58 ` ✓ Xe.CI.BAT: " Patchwork
2025-10-23 7:31 ` ✗ Xe.CI.Full: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aPkaJnsZmaIXF6rO@lstrano-desk.jf.intel.com \
--to=matthew.brost@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.auld@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox