* [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
@ 2023-12-15 12:54 Thomas Hellström
2023-12-15 12:57 ` ✗ CI.Patch_applied: failure for " Patchwork
2023-12-15 23:40 ` [PATCH] " Matt Roper
0 siblings, 2 replies; 4+ messages in thread
From: Thomas Hellström @ 2023-12-15 12:54 UTC (permalink / raw)
To: intel-xe
Since the migrate code is using the identity map for addressing VRAM,
copy chunks may become as small as 64K if the VRAM resource is fragmented.
However, a chunk size smaller than 1MiB may lead to the *next* chunk's
offset into the CCS metadata backup memory not being page-aligned, and
the XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could,
the current code doesn't handle the offset calculation correctly.
To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If
the remaining data to copy is smaller than that, that's not a problem,
so use the remaining size. If the VRAM copy chunk becomes fragmented due
to the size alignment restriction, don't use the identity map, but instead
emit PTEs into the page-table like we do for system memory.
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
drivers/gpu/drm/xe/tests/xe_migrate.c | 2 +-
drivers/gpu/drm/xe/xe_migrate.c | 67 ++++++++++++++++-----------
2 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
index 47fcd6e6b777..5f5b416dc88c 100644
--- a/drivers/gpu/drm/xe/tests/xe_migrate.c
+++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
@@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt),
- &src_it, XE_PAGE_SIZE, pt);
+ &src_it, XE_PAGE_SIZE, pt->ttm.resource);
run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index 2ca927f3fb2a..0b8a33116322 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -411,14 +411,31 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
{
- /*
- * For VRAM we use identity mapped pages so we are limited to current
- * cursor size. For system we program the pages ourselves so we have no
- * such limitation.
- */
- return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
- mem_type_is_vram(cur->mem_type) ? cur->size :
- cur->remaining);
+ u64 size = min_t(u64, MAX_PREEMPTDISABLE_TRANSFER, cur->remaining);
+
+ if (mem_type_is_vram(cur->mem_type)) {
+ /*
+ * VRAM we want to blit in chunks with sizes aligned to
+ * 1MiB in order for the offset to CCS metadata to be
+ * page-aligned. If it's the last chunk it may be smaller.
+ *
+ * Another constraint is that we need to limit the blit to
+ * the VRAM block size, unless size is smaller than 1MiB.
+ */
+ u64 chunk = max_t(u64, cur->size, SZ_1M);
+
+ size = min_t(u64, size, chunk);
+ if (size > SZ_1M)
+ size = round_down(size, SZ_1M);
+ }
+
+ return size;
+}
+
+static bool xe_migrate_avoid_identity(u64 size, const struct xe_res_cursor *cur)
+{
+ /* The chunk is fragmented. Hence can't use identity map. */
+ return cur->size < size;
}
static u32 pte_update_size(struct xe_migrate *m,
@@ -431,7 +448,7 @@ static u32 pte_update_size(struct xe_migrate *m,
u32 cmds = 0;
*L0_pt = pt_ofs;
- if (!is_vram) {
+ if (!is_vram || xe_migrate_avoid_identity(*L0, cur)) {
/* Clip L0 to available size */
u64 size = min(*L0, (u64)avail_pts * SZ_2M);
u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -461,20 +478,13 @@ static void emit_pte(struct xe_migrate *m,
struct xe_bb *bb, u32 at_pt,
bool is_vram,
struct xe_res_cursor *cur,
- u32 size, struct xe_bo *bo)
+ u32 size, struct ttm_resource *res)
{
u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
u32 ptes;
u64 ofs = at_pt * XE_PAGE_SIZE;
u64 cur_ofs;
- /*
- * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
- * we're only emitting VRAM PTEs during sanity tests, so when
- * that's moved to a Kunit test, we should condition VRAM PTEs
- * on running tests.
- */
-
ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
while (ptes) {
@@ -498,10 +508,10 @@ static void emit_pte(struct xe_migrate *m,
if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
!(cur_ofs & (16 * 8 - 1))) {
xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
- flags |= XE_PTE_PS64;
}
- addr += vram_region_gpu_offset(bo->ttm.resource);
+ addr += vram_region_gpu_offset(res);
+ flags |= XE_PTE_PS64;
devmem = true;
}
@@ -730,6 +740,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
&ccs_ofs, &ccs_pt, 0,
2 * NUM_PT_PER_BLIT,
NUM_PT_PER_BLIT);
+ xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
}
/* Add copy commands size here */
@@ -742,20 +753,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
goto err_sync;
}
- if (!src_is_vram)
+ if (!src_is_vram || xe_migrate_avoid_identity(src_L0, &src_it))
emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
- src_bo);
+ src);
else
xe_res_next(&src_it, src_L0);
- if (!dst_is_vram)
+ if (!dst_is_vram || xe_migrate_avoid_identity(src_L0, &dst_it))
emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
- dst_bo);
+ dst);
else
xe_res_next(&dst_it, src_L0);
if (copy_system_ccs)
- emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src_bo);
+ emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src);
bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
update_idx = bb->len;
@@ -984,12 +995,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
size -= clear_L0;
/* Preemption is enabled again by the ring ops. */
- if (!clear_vram) {
+ if (!clear_vram || xe_migrate_avoid_identity(clear_L0, &src_it))
emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
- bo);
- } else {
+ dst);
+ else
xe_res_next(&src_it, clear_L0);
- }
+
bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
update_idx = bb->len;
--
2.42.0
^ permalink raw reply related [flat|nested] 4+ messages in thread* ✗ CI.Patch_applied: failure for drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
2023-12-15 12:54 [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks Thomas Hellström
@ 2023-12-15 12:57 ` Patchwork
2023-12-15 23:40 ` [PATCH] " Matt Roper
1 sibling, 0 replies; 4+ messages in thread
From: Patchwork @ 2023-12-15 12:57 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
== Series Details ==
Series: drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
URL : https://patchwork.freedesktop.org/series/127874/
State : failure
== Summary ==
=== Applying kernel patches on branch 'drm-xe-next' with base: ===
Base commit: 6154a8abd drm/xe: Drop some unnecessary header includes
=== git am output follows ===
error: patch failed: drivers/gpu/drm/xe/tests/xe_migrate.c:331
error: drivers/gpu/drm/xe/tests/xe_migrate.c: patch does not apply
error: patch failed: drivers/gpu/drm/xe/xe_migrate.c:411
error: drivers/gpu/drm/xe/xe_migrate.c: patch does not apply
hint: Use 'git am --show-current-patch' to see the failed patch
Applying: drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
Patch failed at 0001 drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
2023-12-15 12:54 [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks Thomas Hellström
2023-12-15 12:57 ` ✗ CI.Patch_applied: failure for " Patchwork
@ 2023-12-15 23:40 ` Matt Roper
2023-12-16 21:50 ` Thomas Hellström
1 sibling, 1 reply; 4+ messages in thread
From: Matt Roper @ 2023-12-15 23:40 UTC (permalink / raw)
To: Thomas Hellström; +Cc: intel-xe
On Fri, Dec 15, 2023 at 01:54:36PM +0100, Thomas Hellström wrote:
> Since the migrate code is using the identity map for addressing VRAM,
> copy chunks may become as small as 64K if the VRAM resource is fragmented.
>
> However, a chunk size smaller that 1MiB may lead to the *next* chunk's
> offset into the CCS metadata backup memory may not be page-aligned, and
> the XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could,
> the current code doesn't handle the offset calculaton correctly.
>
> To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If
Does this need to be device-specific (derived from
NUM_BYTES_PER_CCS_BYTE)? On DG2 the main:ccs ratio is 256:1, but on LNL
(and presumably future platforms) it's 512:1.
Matt
> the remaining data to copy is smaller than that, that's not a problem,
> so use the remaining size. If the VRAM copy cunk becomes fragmented due
> to the size alignment restriction, don't use the identity map, but instead
> emit PTEs into the page-table like we do for system memory.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
> drivers/gpu/drm/xe/tests/xe_migrate.c | 2 +-
> drivers/gpu/drm/xe/xe_migrate.c | 67 ++++++++++++++++-----------
> 2 files changed, 40 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
> index 47fcd6e6b777..5f5b416dc88c 100644
> --- a/drivers/gpu/drm/xe/tests/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
> @@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
> xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
>
> emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt),
> - &src_it, XE_PAGE_SIZE, pt);
> + &src_it, XE_PAGE_SIZE, pt->ttm.resource);
>
> run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
>
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 2ca927f3fb2a..0b8a33116322 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -411,14 +411,31 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
>
> static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
> {
> - /*
> - * For VRAM we use identity mapped pages so we are limited to current
> - * cursor size. For system we program the pages ourselves so we have no
> - * such limitation.
> - */
> - return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
> - mem_type_is_vram(cur->mem_type) ? cur->size :
> - cur->remaining);
> + u64 size = min_t(u64, MAX_PREEMPTDISABLE_TRANSFER, cur->remaining);
> +
> + if (mem_type_is_vram(cur->mem_type)) {
> + /*
> + * VRAM we want to blit in chunks with sizes aligned to
> + * 1MiB in order for the offset to CCS metadata to be
> + * page-aligned. If it's the last chunk it may be smaller.
> + *
> + * Another constraint is that we need to limit the blit to
> + * the VRAM block size, unless size is smaller than 1MiB.
> + */
> + u64 chunk = max_t(u64, cur->size, SZ_1M);
> +
> + size = min_t(u64, size, chunk);
> + if (size > SZ_1M)
> + size = round_down(size, SZ_1M);
> + }
> +
> + return size;
> +}
> +
> +static bool xe_migrate_avoid_identity(u64 size, const struct xe_res_cursor *cur)
> +{
> + /* The chunk is fragmented. Hence can't use identity map. */
> + return cur->size < size;
> }
>
> static u32 pte_update_size(struct xe_migrate *m,
> @@ -431,7 +448,7 @@ static u32 pte_update_size(struct xe_migrate *m,
> u32 cmds = 0;
>
> *L0_pt = pt_ofs;
> - if (!is_vram) {
> + if (!is_vram || xe_migrate_avoid_identity(*L0, cur)) {
> /* Clip L0 to available size */
> u64 size = min(*L0, (u64)avail_pts * SZ_2M);
> u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
> @@ -461,20 +478,13 @@ static void emit_pte(struct xe_migrate *m,
> struct xe_bb *bb, u32 at_pt,
> bool is_vram,
> struct xe_res_cursor *cur,
> - u32 size, struct xe_bo *bo)
> + u32 size, struct ttm_resource *res)
> {
> u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
> u32 ptes;
> u64 ofs = at_pt * XE_PAGE_SIZE;
> u64 cur_ofs;
>
> - /*
> - * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
> - * we're only emitting VRAM PTEs during sanity tests, so when
> - * that's moved to a Kunit test, we should condition VRAM PTEs
> - * on running tests.
> - */
> -
> ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>
> while (ptes) {
> @@ -498,10 +508,10 @@ static void emit_pte(struct xe_migrate *m,
> if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
> !(cur_ofs & (16 * 8 - 1))) {
> xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
> - flags |= XE_PTE_PS64;
> }
>
> - addr += vram_region_gpu_offset(bo->ttm.resource);
> + addr += vram_region_gpu_offset(res);
> + flags |= XE_PTE_PS64;
> devmem = true;
> }
>
> @@ -730,6 +740,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> &ccs_ofs, &ccs_pt, 0,
> 2 * NUM_PT_PER_BLIT,
> NUM_PT_PER_BLIT);
> + xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
> }
>
> /* Add copy commands size here */
> @@ -742,20 +753,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> goto err_sync;
> }
>
> - if (!src_is_vram)
> + if (!src_is_vram || xe_migrate_avoid_identity(src_L0, &src_it))
> emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
> - src_bo);
> + src);
> else
> xe_res_next(&src_it, src_L0);
>
> - if (!dst_is_vram)
> + if (!dst_is_vram || xe_migrate_avoid_identity(src_L0, &dst_it))
> emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
> - dst_bo);
> + dst);
> else
> xe_res_next(&dst_it, src_L0);
>
> if (copy_system_ccs)
> - emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src_bo);
> + emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src);
>
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> update_idx = bb->len;
> @@ -984,12 +995,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> size -= clear_L0;
>
> /* Preemption is enabled again by the ring ops. */
> - if (!clear_vram) {
> + if (!clear_vram || xe_migrate_avoid_identity(clear_L0, &src_it))
> emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
> - bo);
> - } else {
> + dst);
> + else
> xe_res_next(&src_it, clear_L0);
> - }
> +
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> update_idx = bb->len;
>
> --
> 2.42.0
>
--
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
2023-12-15 23:40 ` [PATCH] " Matt Roper
@ 2023-12-16 21:50 ` Thomas Hellström
0 siblings, 0 replies; 4+ messages in thread
From: Thomas Hellström @ 2023-12-16 21:50 UTC (permalink / raw)
To: Matt Roper; +Cc: intel-xe
Hi, Matt
On 12/16/23 00:40, Matt Roper wrote:
> On Fri, Dec 15, 2023 at 01:54:36PM +0100, Thomas Hellström wrote:
>> Since the migrate code is using the identity map for addressing VRAM,
>> copy chunks may become as small as 64K if the VRAM resource is fragmented.
>>
>> However, a chunk size smaller that 1MiB may lead to the *next* chunk's
>> offset into the CCS metadata backup memory may not be page-aligned, and
>> the XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could,
>> the current code doesn't handle the offset calculaton correctly.
>>
>> To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If
> Does this need to be device-specific (derived from
> NUM_BYTES_PER_CCS_BYTE)? On DG2 the main:ccs ratio is 256:1, but on LNL
> (and presumably future platforms) it's 512:1.
Yes, we need to update this once we have a DGFX card with other than
256:1 (LNL is not affected since we don't use the identity map), but we
don't know yet whether there might be other changes as well. Perhaps I
should look at adjusting the 1MiB size alignment based on
NUM_BYTES_PER_CCS_BYTE.
/Thomas
> Matt
>
>> the remaining data to copy is smaller than that, that's not a problem,
>> so use the remaining size. If the VRAM copy cunk becomes fragmented due
>> to the size alignment restriction, don't use the identity map, but instead
>> emit PTEs into the page-table like we do for system memory.
>>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>> ---
>> drivers/gpu/drm/xe/tests/xe_migrate.c | 2 +-
>> drivers/gpu/drm/xe/xe_migrate.c | 67 ++++++++++++++++-----------
>> 2 files changed, 40 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
>> index 47fcd6e6b777..5f5b416dc88c 100644
>> --- a/drivers/gpu/drm/xe/tests/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
>> @@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
>> xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
>>
>> emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt),
>> - &src_it, XE_PAGE_SIZE, pt);
>> + &src_it, XE_PAGE_SIZE, pt->ttm.resource);
>>
>> run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
>> index 2ca927f3fb2a..0b8a33116322 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -411,14 +411,31 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
>>
>> static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
>> {
>> - /*
>> - * For VRAM we use identity mapped pages so we are limited to current
>> - * cursor size. For system we program the pages ourselves so we have no
>> - * such limitation.
>> - */
>> - return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
>> - mem_type_is_vram(cur->mem_type) ? cur->size :
>> - cur->remaining);
>> + u64 size = min_t(u64, MAX_PREEMPTDISABLE_TRANSFER, cur->remaining);
>> +
>> + if (mem_type_is_vram(cur->mem_type)) {
>> + /*
>> + * VRAM we want to blit in chunks with sizes aligned to
>> + * 1MiB in order for the offset to CCS metadata to be
>> + * page-aligned. If it's the last chunk it may be smaller.
>> + *
>> + * Another constraint is that we need to limit the blit to
>> + * the VRAM block size, unless size is smaller than 1MiB.
>> + */
>> + u64 chunk = max_t(u64, cur->size, SZ_1M);
>> +
>> + size = min_t(u64, size, chunk);
>> + if (size > SZ_1M)
>> + size = round_down(size, SZ_1M);
>> + }
>> +
>> + return size;
>> +}
>> +
>> +static bool xe_migrate_avoid_identity(u64 size, const struct xe_res_cursor *cur)
>> +{
>> + /* The chunk is fragmented. Hence can't use identity map. */
>> + return cur->size < size;
>> }
>>
>> static u32 pte_update_size(struct xe_migrate *m,
>> @@ -431,7 +448,7 @@ static u32 pte_update_size(struct xe_migrate *m,
>> u32 cmds = 0;
>>
>> *L0_pt = pt_ofs;
>> - if (!is_vram) {
>> + if (!is_vram || xe_migrate_avoid_identity(*L0, cur)) {
>> /* Clip L0 to available size */
>> u64 size = min(*L0, (u64)avail_pts * SZ_2M);
>> u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>> @@ -461,20 +478,13 @@ static void emit_pte(struct xe_migrate *m,
>> struct xe_bb *bb, u32 at_pt,
>> bool is_vram,
>> struct xe_res_cursor *cur,
>> - u32 size, struct xe_bo *bo)
>> + u32 size, struct ttm_resource *res)
>> {
>> u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
>> u32 ptes;
>> u64 ofs = at_pt * XE_PAGE_SIZE;
>> u64 cur_ofs;
>>
>> - /*
>> - * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
>> - * we're only emitting VRAM PTEs during sanity tests, so when
>> - * that's moved to a Kunit test, we should condition VRAM PTEs
>> - * on running tests.
>> - */
>> -
>> ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>>
>> while (ptes) {
>> @@ -498,10 +508,10 @@ static void emit_pte(struct xe_migrate *m,
>> if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
>> !(cur_ofs & (16 * 8 - 1))) {
>> xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
>> - flags |= XE_PTE_PS64;
>> }
>>
>> - addr += vram_region_gpu_offset(bo->ttm.resource);
>> + addr += vram_region_gpu_offset(res);
>> + flags |= XE_PTE_PS64;
>> devmem = true;
>> }
>>
>> @@ -730,6 +740,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>> &ccs_ofs, &ccs_pt, 0,
>> 2 * NUM_PT_PER_BLIT,
>> NUM_PT_PER_BLIT);
>> + xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
>> }
>>
>> /* Add copy commands size here */
>> @@ -742,20 +753,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>> goto err_sync;
>> }
>>
>> - if (!src_is_vram)
>> + if (!src_is_vram || xe_migrate_avoid_identity(src_L0, &src_it))
>> emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
>> - src_bo);
>> + src);
>> else
>> xe_res_next(&src_it, src_L0);
>>
>> - if (!dst_is_vram)
>> + if (!dst_is_vram || xe_migrate_avoid_identity(src_L0, &dst_it))
>> emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
>> - dst_bo);
>> + dst);
>> else
>> xe_res_next(&dst_it, src_L0);
>>
>> if (copy_system_ccs)
>> - emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src_bo);
>> + emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src);
>>
>> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>> update_idx = bb->len;
>> @@ -984,12 +995,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>> size -= clear_L0;
>>
>> /* Preemption is enabled again by the ring ops. */
>> - if (!clear_vram) {
>> + if (!clear_vram || xe_migrate_avoid_identity(clear_L0, &src_it))
>> emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
>> - bo);
>> - } else {
>> + dst);
>> + else
>> xe_res_next(&src_it, clear_L0);
>> - }
>> +
>> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>> update_idx = bb->len;
>>
>> --
>> 2.42.0
>>
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-12-17 2:00 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-12-15 12:54 [PATCH] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks Thomas Hellström
2023-12-15 12:57 ` ✗ CI.Patch_applied: failure for " Patchwork
2023-12-15 23:40 ` [PATCH] " Matt Roper
2023-12-16 21:50 ` Thomas Hellström
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox