Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/xe/lnl: Only do gpu sys page clear for non-pooled BOs
@ 2024-08-21  9:50 Nirmoy Das
  2024-08-21  9:50 ` [PATCH 2/2] Revert "drm/ttm: Add a flag to allow drivers to skip clear-on-free" Nirmoy Das
                   ` (9 more replies)
  0 siblings, 10 replies; 18+ messages in thread
From: Nirmoy Das @ 2024-08-21  9:50 UTC (permalink / raw)
  To: dri-devel
  Cc: intel-xe, Nirmoy Das, Christian König, Himal Prasad Ghimiray,
	Matthew Auld, Matthew Brost, Thomas Hellström

Currently XE lacks clean-on-free implementation so using
TTM_TT_FLAG_CLEARED_ON_FREE is invalid. Remove usage of
TTM_TT_FLAG_CLEARED_ON_FREE and limit gpu system page clearing
only for WB cached BOs which are not pooled so there is no need to
return a zeroed pages to a pool.

Without the patch:
api_overhead_benchmark_l0 --testFilter=UsmMemoryAllocation:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 79.439 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98677.75 us
Perf tool top 5 entries:
11.16%  api_overhead_be [kernel.kallsyms] [k] __pageblock_pfn_to_page
7.85%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
7.59%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
7.24%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
5.53%  api_overhead_be  [kernel.kallsyms] [k] lookup_address_in_pgd_attr

With the patch:
UsmMemoryAllocation(api=l0 type=Host size=4KB) 78.164 us
UsmMemoryAllocation(api=l0 type=Host size=1GB) 98880.39 us
Perf tool top 5 entries:
25.40% api_overhead_be  [kernel.kallsyms] [k] clear_page_erms
9.89%  api_overhead_be  [kernel.kallsyms] [k] pages_are_mergeable
4.64%  api_overhead_be  [kernel.kallsyms] [k] cpa_flush
4.04%  api_overhead_be  [kernel.kallsyms] [k] find_next_iomem_res
3.96%  api_overhead_be  [kernel.kallsyms] [k] mod_find

This is still better than the base case where there was no
page clearing offloading.

Cc: Christian König <christian.koenig@amd.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 6ed0e1955215..a18408d5d185 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -283,6 +283,7 @@ struct xe_ttm_tt {
 	struct device *dev;
 	struct sg_table sgt;
 	struct sg_table *sg;
+	bool clear_system_pages;
 };
 
 static int xe_tt_map_sg(struct ttm_tt *tt)
@@ -397,12 +398,17 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
 	}
 
 	/*
-	 * If the device can support gpu clear system pages then set proper ttm
+	 * If the device can support gpu clear system pages then set proper
 	 * flag. Zeroed pages are only required for ttm_bo_type_device so
 	 * unwanted data is not leaked to userspace.
+	 *
+	 * XE currently does clear-on-alloc so gpu clear will only work on
+	 * non-pooled BO, DRM_XE_GEM_CPU_CACHING_WB otherwise global pool will
+	 * get poisoned ono-zeroed pages.
 	 */
-	if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys)
-		page_flags |= TTM_TT_FLAG_CLEARED_ON_FREE;
+	if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys &&
+	    bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)
+		tt->clear_system_pages = true;
 
 	err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages);
 	if (err) {
@@ -416,8 +422,11 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
 static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
 			      struct ttm_operation_ctx *ctx)
 {
+	struct xe_ttm_tt *xe_tt;
 	int err;
 
+	xe_tt = container_of(tt, struct xe_ttm_tt, ttm);
+
 	/*
 	 * dma-bufs are not populated with pages, and the dma-
 	 * addresses are set up when moved to XE_PL_TT.
@@ -426,7 +435,7 @@ static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
 		return 0;
 
 	/* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear system pages */
-	if (tt->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE)
+	if (xe_tt->clear_system_pages)
 		tt->page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
 
 	err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
@@ -664,6 +673,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	struct ttm_resource *old_mem = ttm_bo->resource;
 	u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM;
 	struct ttm_tt *ttm = ttm_bo->ttm;
+	struct xe_ttm_tt *xe_tt = container_of(ttm, struct xe_ttm_tt, ttm);
 	struct xe_migrate *migrate = NULL;
 	struct dma_fence *fence;
 	bool move_lacks_source;
@@ -671,15 +681,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	bool needs_clear;
 	bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
 				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
-	bool clear_system_pages;
+	bool clear_system_pages = false;
 	int ret = 0;
 
 	/*
 	 * Clear TTM_TT_FLAG_CLEARED_ON_FREE on bo creation path when
 	 * moving to system as the bo doesn't have dma_mapping.
 	 */
-	if (!old_mem && ttm && !ttm_tt_is_populated(ttm))
-		ttm->page_flags &= ~TTM_TT_FLAG_CLEARED_ON_FREE;
+	if (!old_mem && ttm && !ttm_tt_is_populated(ttm) &&
+	    xe_tt->clear_system_pages)
+		xe_tt->clear_system_pages = false;
 
 	/* Bo creation path, moving to system or TT. */
 	if ((!old_mem && ttm) && !handle_system_ccs) {
@@ -703,7 +714,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
 						(!mem_type_is_vram(old_mem_type) && !tt_has_data);
 
-	clear_system_pages = ttm && (ttm->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE);
+	clear_system_pages = ttm && xe_tt->clear_system_pages;
 	needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
 		(!ttm && ttm_bo->type == ttm_bo_type_device) ||
 		clear_system_pages;
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread
* [PATCH 1/2] Revert "drm/xe/lnl: Offload system clear page activity to GPU"
@ 2024-08-28  8:36 Nirmoy Das
  2024-08-28  8:36 ` [PATCH 2/2] Revert "drm/ttm: Add a flag to allow drivers to skip clear-on-free" Nirmoy Das
  0 siblings, 1 reply; 18+ messages in thread
From: Nirmoy Das @ 2024-08-28  8:36 UTC (permalink / raw)
  To: dri-devel
  Cc: intel-xe, Nirmoy Das, Christian König, Himal Prasad Ghimiray,
	Lucas De Marchi, Matthew Auld, Matthew Brost,
	Thomas Hellström

This optimization relied on having to clear CCS on allocations.
If there is no need to clear CCS on allocations then this would mostly
help in reducing CPU utilization.

Revert this patch at this moment because of:
1 Currently Xe can't do clear on free and using a invalid ttm flag,
TTM_TT_FLAG_CLEARED_ON_FREE which could poison global ttm pool on
multi-device setup.

2 Also for LNL CPU:WB doesn't require clearing CCS as such BO will
not be allowed to bind with compression PTE. Subsequent patch will
disable clearing CCS for CPU:WB BOs for LNL.

This reverts commit 23683061805be368c8d1c7e7ff52abc470cac275.

Cc: Christian König <christian.koenig@amd.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c           | 26 ++------------------------
 drivers/gpu/drm/xe/xe_device_types.h |  2 --
 drivers/gpu/drm/xe/xe_ttm_sys_mgr.c  | 12 ------------
 3 files changed, 2 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 9d6632f92fa9..25d0c939ba31 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -396,14 +396,6 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
 		caching = ttm_uncached;
 	}
 
-	/*
-	 * If the device can support gpu clear system pages then set proper ttm
-	 * flag. Zeroed pages are only required for ttm_bo_type_device so
-	 * unwanted data is not leaked to userspace.
-	 */
-	if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys)
-		page_flags |= TTM_TT_FLAG_CLEARED_ON_FREE;
-
 	err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages);
 	if (err) {
 		kfree(tt);
@@ -425,10 +417,6 @@ static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt,
 	if (tt->page_flags & TTM_TT_FLAG_EXTERNAL)
 		return 0;
 
-	/* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear system pages */
-	if (tt->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE)
-		tt->page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
-
 	err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx);
 	if (err)
 		return err;
@@ -671,16 +659,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	bool needs_clear;
 	bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) &&
 				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
-	bool clear_system_pages;
 	int ret = 0;
 
-	/*
-	 * Clear TTM_TT_FLAG_CLEARED_ON_FREE on bo creation path when
-	 * moving to system as the bo doesn't have dma_mapping.
-	 */
-	if (!old_mem && ttm && !ttm_tt_is_populated(ttm))
-		ttm->page_flags &= ~TTM_TT_FLAG_CLEARED_ON_FREE;
-
 	/* Bo creation path, moving to system or TT. */
 	if ((!old_mem && ttm) && !handle_system_ccs) {
 		if (new_mem->mem_type == XE_PL_TT)
@@ -703,10 +683,8 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 	move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared)  :
 						(!mem_type_is_vram(old_mem_type) && !tt_has_data);
 
-	clear_system_pages = ttm && (ttm->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE);
 	needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) ||
-		(!ttm && ttm_bo->type == ttm_bo_type_device) ||
-		clear_system_pages;
+		(!ttm && ttm_bo->type == ttm_bo_type_device);
 
 	if (new_mem->mem_type == XE_PL_TT) {
 		ret = xe_tt_map_sg(ttm);
@@ -818,7 +796,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 		if (move_lacks_source) {
 			u32 flags = 0;
 
-			if (mem_type_is_vram(new_mem->mem_type) || clear_system_pages)
+			if (mem_type_is_vram(new_mem->mem_type))
 				flags |= XE_MIGRATE_CLEAR_FLAG_FULL;
 			else if (handle_system_ccs)
 				flags |= XE_MIGRATE_CLEAR_FLAG_CCS_DATA;
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 4ecd620921a3..e73fb0c23932 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -333,8 +333,6 @@ struct xe_device {
 		struct xe_mem_region vram;
 		/** @mem.sys_mgr: system TTM manager */
 		struct ttm_resource_manager sys_mgr;
-		/** @mem.gpu_page_clear_sys: clear system pages offloaded to GPU */
-		bool gpu_page_clear_sys;
 	} mem;
 
 	/** @sriov: device level virtualization data */
diff --git a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c
index e0ac20f20758..9844a8edbfe1 100644
--- a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c
@@ -117,17 +117,5 @@ int xe_ttm_sys_mgr_init(struct xe_device *xe)
 	ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
 	ttm_set_driver_manager(&xe->ttm, XE_PL_TT, man);
 	ttm_resource_manager_set_used(man, true);
-
-	/*
-	 * On iGFX device with flat CCS, we clear CCS metadata, let's extend that
-	 * and use GPU to clear pages as well.
-	 *
-	 * Disable this when init_on_free and/or init_on_alloc is on to avoid double
-	 * zeroing pages with CPU and GPU.
-	 */
-	if (xe_device_has_flat_ccs(xe) && !IS_DGFX(xe) &&
-	    !want_init_on_alloc(GFP_USER) && !want_init_on_free())
-		xe->mem.gpu_page_clear_sys = true;
-
 	return drmm_add_action_or_reset(&xe->drm, ttm_sys_mgr_fini, xe);
 }
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2024-08-28  9:08 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-21  9:50 [PATCH 1/2] drm/xe/lnl: Only do gpu sys page clear for non-pooled BOs Nirmoy Das
2024-08-21  9:50 ` [PATCH 2/2] Revert "drm/ttm: Add a flag to allow drivers to skip clear-on-free" Nirmoy Das
2024-08-27  9:42   ` Thomas Hellström
2024-08-27 16:14     ` Lucas De Marchi
2024-08-28  6:32       ` Christian König
2024-08-21 10:37 ` ✓ CI.Patch_applied: success for series starting with [1/2] drm/xe/lnl: Only do gpu sys page clear for non-pooled BOs Patchwork
2024-08-21 10:38 ` ✓ CI.checkpatch: " Patchwork
2024-08-21 10:39 ` ✓ CI.KUnit: " Patchwork
2024-08-21 10:51 ` ✓ CI.Build: " Patchwork
2024-08-21 10:53 ` ✓ CI.Hooks: " Patchwork
2024-08-21 10:54 ` ✗ CI.checksparse: warning " Patchwork
2024-08-21 11:15 ` ✓ CI.BAT: success " Patchwork
2024-08-21 12:39 ` ✗ CI.FULL: failure " Patchwork
2024-08-28  8:09 ` [PATCH 1/2] " Thomas Hellström
2024-08-28  8:12   ` Thomas Hellström
2024-08-28  8:19     ` Nirmoy Das
2024-08-28  9:08   ` Nirmoy Das
  -- strict thread matches above, loose matches on Subject: below --
2024-08-28  8:36 [PATCH 1/2] Revert "drm/xe/lnl: Offload system clear page activity to GPU" Nirmoy Das
2024-08-28  8:36 ` [PATCH 2/2] Revert "drm/ttm: Add a flag to allow drivers to skip clear-on-free" Nirmoy Das

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox