From: "Hellstrom, Thomas" <thomas.hellstrom@intel.com>
To: "igt-dev@lists.freedesktop.org" <igt-dev@lists.freedesktop.org>,
"Sharma, Nishit" <nishit.sharma@intel.com>
Subject: Re: [PATCH v11 07/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU fault handling test
Date: Thu, 4 Dec 2025 07:16:03 +0000
Message-ID: <2a11b91ecbcddc785821ccacaf6f9eb98133f6a0.camel@intel.com>
In-Reply-To: <20251204051051.27495-8-nishit.sharma@intel.com>
On Thu, 2025-12-04 at 05:10 +0000, nishit.sharma@intel.com wrote:
> From: Nishit Sharma <nishit.sharma@intel.com>
>
> This test intentionally triggers page faults by accessing regions
> without prefetch for both GPUs in a multi-GPU environment.
>
> Signed-off-by: Nishit Sharma <nishit.sharma@intel.com>
> Reviewed-by: Pravalika Gurram <pravalika.gurram@intel.com>
Acked-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
> tests/intel/xe_multigpu_svm.c | 143 ++++++++++++++++++++++++++++++++++
> 1 file changed, 143 insertions(+)
>
> diff --git a/tests/intel/xe_multigpu_svm.c b/tests/intel/xe_multigpu_svm.c
> index d37043e16..4a0a28229 100644
> --- a/tests/intel/xe_multigpu_svm.c
> +++ b/tests/intel/xe_multigpu_svm.c
> @@ -15,6 +15,7 @@
>
> #include "time.h"
>
> +#include "xe/xe_gt.h"
> #include "xe/xe_ioctl.h"
> #include "xe/xe_query.h"
> #include "xe/xe_util.h"
> @@ -89,6 +90,17 @@
> * Measure latency of cross-GPU memory copy operations with prefetch
> * to evaluate copy performance with memory migration to local VRAM
> *
> + * SUBTEST: mgpu-pagefault-basic
> + * Description:
> + * Test cross-GPU page fault handling where one GPU writes to memory
> + * and another GPU reads, triggering page faults without prefetch to
> + * validate on-demand page migration across GPUs
> + *
> + * SUBTEST: mgpu-pagefault-prefetch
> + * Description:
> + * Test cross-GPU memory access with prefetch to verify page fault
> + * suppression when memory is pre-migrated to target GPU's VRAM
> + *
> */
>
> #define MAX_XE_REGIONS 8
> @@ -108,6 +120,7 @@
> #define MULTIGPU_COH_FAIL BIT(5)
> #define MULTIGPU_PERF_OP BIT(6)
> #define MULTIGPU_PERF_REM_COPY BIT(7)
> +#define MULTIGPU_PFAULT_OP BIT(8)
>
> #define INIT 2
> #define STORE 3
> @@ -163,6 +176,11 @@ static void gpu_latency_test_wrapper(struct xe_svm_gpu_info *src,
> struct drm_xe_engine_class_instance *eci,
> unsigned int flags);
>
> +static void gpu_fault_test_wrapper(struct xe_svm_gpu_info *src,
> + struct xe_svm_gpu_info *dst,
> + struct drm_xe_engine_class_instance *eci,
> + unsigned int flags);
> +
> static void
> create_vm_and_queue(struct xe_svm_gpu_info *gpu, struct drm_xe_engine_class_instance *eci,
> uint32_t *vm, uint32_t *exec_queue)
> @@ -924,6 +942,115 @@ latency_test_multigpu(struct xe_svm_gpu_info *gpu1,
> cleanup_vm_and_queue(gpu2, vm[1], exec_queue[1]);
> }
>
> +static void
> +pagefault_test_multigpu(struct xe_svm_gpu_info *gpu1,
> + struct xe_svm_gpu_info *gpu2,
> + struct drm_xe_engine_class_instance *eci,
> + unsigned int flags)
> +{
> + uint64_t addr;
> + uint64_t addr1;
> + uint32_t vm[2];
> + uint32_t exec_queue[2];
> + uint32_t batch_bo[2];
> + uint64_t batch_addr[2];
> + struct drm_xe_sync sync = {};
> + uint64_t *sync_addr;
> + void *data, *verify_result;
> + const char *pf_count_stat = "svm_pagefault_count";
> + int pf_count_gpu1_before, pf_count_gpu1_after;
> + int pf_count_gpu2_before, pf_count_gpu2_after;
> + bool prefetch_req = flags & MULTIGPU_PREFETCH;
> +
> + /* Skip if either GPU doesn't support faults */
> + if (mgpu_check_fault_support(gpu1, gpu2))
> + return;
> +
> + create_vm_and_queue(gpu1, eci, &vm[0], &exec_queue[0]);
> + create_vm_and_queue(gpu2, eci, &vm[1], &exec_queue[1]);
> +
> + data = aligned_alloc(SZ_2M, SZ_4K);
> + igt_assert(data);
> + memset(data, 0, SZ_4K);
> + addr = to_user_pointer(data);
> +
> + /* Allocate verification buffer for GPU2 to copy into */
> + verify_result = aligned_alloc(SZ_2M, SZ_4K);
> + igt_assert(verify_result);
> + addr1 = to_user_pointer(verify_result);
> +
> + /* === Phase 1: GPU1 writes to addr === */
> + pf_count_gpu1_before = xe_gt_stats_get_count(gpu1->fd, eci->gt_id, pf_count_stat);
> +
> + /* GPU1 --> Creating batch with value and executing STORE op */
> + gpu_batch_create(gpu1, vm[0], exec_queue[0], addr, 0,
> + &batch_bo[0], &batch_addr[0], flags, DWORD);
> +
> + /* GPU1: Madvise and Prefetch Ops */
> + gpu_madvise_exec_sync(gpu1, vm[0], exec_queue[0], addr, &batch_addr[0], flags, NULL);
> +
> + pf_count_gpu1_after = xe_gt_stats_get_count(gpu1->fd, eci->gt_id, pf_count_stat);
> +
> + if (prefetch_req) {
> + /* With prefetch: expect NO page faults */
> + igt_assert_eq(pf_count_gpu1_after, pf_count_gpu1_before);
> + igt_info("GPU1 write with prefetch: No page faults (as expected)\n");
> + } else {
> + /* Without prefetch: expect page faults */
> + igt_warn_on_f(pf_count_gpu1_after > pf_count_gpu1_before,
> + "%d page faults generated on GPU1\n",
> + pf_count_gpu1_after - pf_count_gpu1_before);
> + igt_info("GPU1 write without prefetch: %d page faults\n",
> + pf_count_gpu1_after - pf_count_gpu1_before);
> + }
> +
> + /* === Phase 2: GPU2 reads from addr (cross-GPU access) === */
> + pf_count_gpu2_before = xe_gt_stats_get_count(gpu2->fd, eci->gt_id, pf_count_stat);
> +
> + /* GPU2 --> Create batch for GPU2 to copy from addr (GPU1's memory) to verify_result */
> + gpu_batch_create(gpu2, vm[1], exec_queue[1], addr, addr1,
> + &batch_bo[1], &batch_addr[1], flags, INIT);
> +
> + /* Prefetch src buffer (addr) to avoid page faults */
> + xe_multigpu_madvise(gpu2->fd, vm[1], addr, SZ_4K, 0,
> + DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC,
> + gpu2->fd, 0, gpu2->vram_regions[0], exec_queue[1]);
> +
> + setup_sync(&sync, &sync_addr, BIND_SYNC_VAL);
> + xe_multigpu_prefetch(gpu2->fd, vm[1], addr, SZ_4K, &sync,
> + sync_addr, exec_queue[1], flags);
> +
> + free(sync_addr);
> +
> + /* GPU2: Madvise and Prefetch Ops */
> + gpu_madvise_exec_sync(gpu2, vm[1], exec_queue[1], addr1, &batch_addr[1], flags, NULL);
> +
> + pf_count_gpu2_after = xe_gt_stats_get_count(gpu2->fd, eci->gt_id, pf_count_stat);
> +
> + if (prefetch_req) {
> + /* With prefetch: expect NO page faults on GPU2 */
> + igt_assert_eq(pf_count_gpu2_after, pf_count_gpu2_before);
> + igt_info("GPU2 cross-GPU read with prefetch: No page faults (as expected)\n");
> + } else {
> + /* Without prefetch: expect cross-GPU page faults */
> + igt_warn_on_f(pf_count_gpu2_after > pf_count_gpu2_before,
> + "%d page faults generated on GPU2\n",
> + pf_count_gpu2_after - pf_count_gpu2_before);
> + igt_info("GPU2 cross-GPU read without prefetch: %d page faults\n",
> + pf_count_gpu2_after - pf_count_gpu2_before);
> + }
> +
> + munmap((void *)batch_addr[0], BATCH_SIZE(gpu1->fd));
> + munmap((void *)batch_addr[1], BATCH_SIZE(gpu2->fd));
> + batch_fini(gpu1->fd, vm[0], batch_bo[0], batch_addr[0]);
> + batch_fini(gpu2->fd, vm[1], batch_bo[1], batch_addr[1]);
> + free(data);
> + free(verify_result);
> +
> + cleanup_vm_and_queue(gpu1, vm[0], exec_queue[0]);
> + cleanup_vm_and_queue(gpu2, vm[1], exec_queue[1]);
> +}
> +
> static void
> gpu_mem_access_wrapper(struct xe_svm_gpu_info *src,
> struct xe_svm_gpu_info *dst,
> @@ -972,6 +1099,18 @@ gpu_latency_test_wrapper(struct xe_svm_gpu_info *src,
> latency_test_multigpu(src, dst, eci, flags);
> }
>
> +static void
> +gpu_fault_test_wrapper(struct xe_svm_gpu_info *src,
> + struct xe_svm_gpu_info *dst,
> + struct drm_xe_engine_class_instance *eci,
> + unsigned int flags)
> +{
> + igt_assert(src);
> + igt_assert(dst);
> +
> + pagefault_test_multigpu(src, dst, eci, flags);
> +}
> +
> static void
> test_mgpu_exec(int gpu_cnt, struct xe_svm_gpu_info *gpus,
> struct drm_xe_engine_class_instance *eci,
> @@ -985,6 +1124,8 @@ test_mgpu_exec(int gpu_cnt, struct xe_svm_gpu_info *gpus,
> for_each_gpu_pair(gpu_cnt, gpus, eci, gpu_coherecy_test_wrapper, flags);
> if (flags & MULTIGPU_PERF_OP)
> for_each_gpu_pair(gpu_cnt, gpus, eci, gpu_latency_test_wrapper, flags);
> + if (flags & MULTIGPU_PFAULT_OP)
> + for_each_gpu_pair(gpu_cnt, gpus, eci, gpu_fault_test_wrapper, flags);
> }
>
> struct section {
> @@ -1020,6 +1161,8 @@ igt_main
> { "latency-prefetch", MULTIGPU_PREFETCH | MULTIGPU_PERF_OP },
> { "latency-copy-prefetch", MULTIGPU_PREFETCH | MULTIGPU_PERF_OP | MULTIGPU_PERF_REM_COPY },
> + { "pagefault-basic", MULTIGPU_PFAULT_OP },
> + { "pagefault-prefetch", MULTIGPU_PREFETCH | MULTIGPU_PFAULT_OP },
> { NULL },
> };
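
Just to spell out the pattern for anyone following the thread: both new
subtests boil down to bracketing the GPU access with the per-GT
"svm_pagefault_count" stat and checking the delta. A minimal sketch of
that pattern, using only the helpers this patch already relies on
(check_fault_delta() and gpu_work() are made-up names for illustration,
not part of the test):

/*
 * Condensed illustration of the fault-count bracketing done in
 * pagefault_test_multigpu(). Assumes the IGT xe helpers pulled in by
 * this patch (xe/xe_gt.h).
 */
static void check_fault_delta(int fd, int gt_id, bool prefetch_req,
			      void (*gpu_work)(void))
{
	const char *stat = "svm_pagefault_count";
	int before, after;

	before = xe_gt_stats_get_count(fd, gt_id, stat);
	gpu_work();	/* submit the GPU access and wait for it */
	after = xe_gt_stats_get_count(fd, gt_id, stat);

	if (prefetch_req)
		/* memory was prefetched to VRAM: no faults expected */
		igt_assert_eq(after, before);
	else
		/* on-demand migration path: report the fault count */
		igt_info("%d page faults\n", after - before);
}

With prefetch the delta is asserted to be zero; without prefetch this
sketch only reports the count (the test itself additionally warns via
igt_warn_on_f()).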
>
Thread overview: 20+ messages
2025-12-04 5:10 [PATCH v11 00/10] Madvise feature in SVM for Multi-GPU configs nishit.sharma
2025-12-04 5:10 ` [PATCH v11 01/10] DONT_MERGE: lib/xe: Add instance parameter to xe_vm_madvise nishit.sharma
2025-12-04 5:10 ` [PATCH v11 02/10] lib/xe: Add synchronous helpers for VM bind/unbind operations nishit.sharma
2025-12-04 5:10 ` [PATCH v11 03/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU xGPU memory access test nishit.sharma
2025-12-04 5:10 ` [PATCH v11 04/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU atomic operations nishit.sharma
2025-12-04 5:10 ` [PATCH v11 05/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU coherency test nishit.sharma
2025-12-04 5:10 ` [PATCH v11 06/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU performance test nishit.sharma
2025-12-04 8:42 ` Hellstrom, Thomas
2025-12-04 8:45 ` Sharma, Nishit
2025-12-04 8:48 ` Hellstrom, Thomas
2025-12-04 5:10 ` [PATCH v11 07/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU fault handling test nishit.sharma
2025-12-04 7:16 ` Hellstrom, Thomas [this message]
2025-12-04 5:10 ` [PATCH v11 08/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU simultaneous access test nishit.sharma
2025-12-04 5:10 ` [PATCH v11 09/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU conflicting madvise test nishit.sharma
2025-12-04 7:24 ` Hellstrom, Thomas
2025-12-04 5:10 ` [PATCH v11 10/10] tests/intel/xe_multigpu_svm: Add SVM multi-GPU migration test nishit.sharma
2025-12-04 6:18 ` ✓ Xe.CI.BAT: success for Madvise feature in SVM for Multi-GPU configs Patchwork
2025-12-04 6:21 ` ✓ i915.CI.BAT: " Patchwork
2025-12-04 7:27 ` ✗ Xe.CI.Full: failure " Patchwork
2025-12-05 14:06 ` ✓ i915.CI.Full: success " Patchwork