From: "Hellstrom, Thomas" <thomas.hellstrom@intel.com>
To: "igt-dev@lists.freedesktop.org" <igt-dev@lists.freedesktop.org>,
	"Sharma,  Nishit" <nishit.sharma@intel.com>
Subject: Re: [PATCH i-g-t v7 06/10] tests/intel/xe_multi_gpusvm.c: Add SVM multi-GPU performance test
Date: Mon, 17 Nov 2025 14:39:26 +0000	[thread overview]
Message-ID: <faee0ba31ca34eb7f1c2c6d0f5fc4a3f8d30ecea.camel@intel.com> (raw)
In-Reply-To: <20251113163308.633818-7-nishit.sharma@intel.com>

On Thu, 2025-11-13 at 16:33 +0000, nishit.sharma@intel.com wrote:
> From: Nishit Sharma <nishit.sharma@intel.com>
> 
> This test measures latency and bandwidth for buffer access from each GPU
> and the CPU in a multi-GPU SVM environment. It compares performance for
> local versus remote access using madvise and prefetch to control buffer
> placement
> 
> Signed-off-by: Nishit Sharma <nishit.sharma@intel.com>
> ---
>  tests/intel/xe_multi_gpusvm.c | 181 ++++++++++++++++++++++++++++++++++
>  1 file changed, 181 insertions(+)
> 
> diff --git a/tests/intel/xe_multi_gpusvm.c b/tests/intel/xe_multi_gpusvm.c
> index 6792ef72c..2c8e62e34 100644
> --- a/tests/intel/xe_multi_gpusvm.c
> +++ b/tests/intel/xe_multi_gpusvm.c
> @@ -13,6 +13,8 @@
>  #include "intel_mocs.h"
>  #include "intel_reg.h"
>  
> +#include "time.h"
> +
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
>  #include "xe/xe_util.h"
> @@ -41,6 +43,11 @@
>   * Description:
>   * 	This test checks coherency in multi-gpu by writing from GPU0
>   * 	reading from GPU1 and verify and repeating with CPU and both GPUs
> + *
> + * SUBTEST: latency-multi-gpu
> + * Description:
> + * 	This test measures and compares latency and bandwidth for buffer access
> + * 	from CPU, local GPU, and remote GPU
>   */
>  
>  #define MAX_XE_REGIONS	8
> @@ -103,6 +110,11 @@ static void gpu_coherecy_test_wrapper(struct xe_svm_gpu_info *src,
>  				      struct drm_xe_engine_class_instance *eci,
>  				      void *extra_args);
>  
> +static void gpu_latency_test_wrapper(struct xe_svm_gpu_info *src,
> +				     struct xe_svm_gpu_info *dst,
> +				     struct drm_xe_engine_class_instance *eci,
> +				     void *extra_args);
> +
>  static void
>  create_vm_and_queue(struct xe_svm_gpu_info *gpu, struct drm_xe_engine_class_instance *eci,
>  		    uint32_t *vm, uint32_t *exec_queue)
> @@ -197,6 +209,11 @@ static void for_each_gpu_pair(int num_gpus, struct xe_svm_gpu_info *gpus,
>  
>  static void open_pagemaps(int fd, struct xe_svm_gpu_info *info);
>  
> +static double time_diff(struct timespec *start, struct timespec *end)
> +{
> +    return (end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
>  static void
>  atomic_batch_init(int fd, uint32_t vm, uint64_t src_addr,
>  		  uint32_t *bo, uint64_t *addr)
> @@ -549,6 +566,147 @@ coherency_test_multigpu(struct xe_svm_gpu_info *gpu0,
>  	cleanup_vm_and_queue(gpu1, vm[1], exec_queue[1]);
>  }
>  
> +static void
> +latency_test_multigpu(struct xe_svm_gpu_info *gpu0,
> +		      struct xe_svm_gpu_info *gpu1,
> +		      struct drm_xe_engine_class_instance *eci,
> +		      bool remote_copy,
> +		      bool prefetch_req)
> +{
> +        uint64_t addr;
> +        uint32_t vm[2];
> +        uint32_t exec_queue[2];
> +        uint32_t batch_bo;
> +        uint8_t *copy_dst;
> +        uint64_t batch_addr;
> +        struct drm_xe_sync sync = {};
> +        volatile uint64_t *sync_addr;
> +        int value = 60;
> +        int shared_val[4];
> +        struct test_exec_data *data;
> +	struct timespec t_start, t_end;
> +	double cpu_latency, gpu1_latency, gpu2_latency;
> +	double cpu_bw, gpu1_bw, gpu2_bw;

Also here, indentation looks inconsistent throughout the function.

> +
> +	create_vm_and_queue(gpu0, eci, &vm[0], &exec_queue[0]);
> +	create_vm_and_queue(gpu1, eci, &vm[1], &exec_queue[1]);
> +
> +        data = aligned_alloc(SZ_2M, SZ_4K);
> +        igt_assert(data);
> +        data[0].vm_sync = 0;
> +        addr = to_user_pointer(data);
> +
> +        copy_dst = aligned_alloc(SZ_2M, SZ_4K);
> +        igt_assert(copy_dst);
> +
> +        store_dword_batch_init(gpu0->fd, vm[0], addr, &batch_bo, &batch_addr, value);
> +
> +	/* Measure GPU0 access latency/bandwidth */
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> +        /* GPU0(src_gpu) access */
> +	xe_multigpu_madvise(gpu0->fd, vm[0], addr, SZ_4K, 0,
> +			    DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC,
> +			    gpu0->fd, 0, gpu0->vram_regions[0], exec_queue[0],
> +			    0, 0);
> +
> +	setup_sync(&sync, &sync_addr, BIND_SYNC_VAL);
> +	xe_multigpu_prefetch(gpu0->fd, vm[0], addr, SZ_4K, &sync,
> +			     sync_addr, exec_queue[0], prefetch_req);
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_end);

So here we are measuring the madvise and prefetch (if any) latency
only. Is that intentional?

> +	gpu1_latency = time_diff(&t_start, &t_end);
> +	gpu1_bw = COPY_SIZE / gpu1_latency / (1024 * 1024); // MB/s

Aren't we doing a single dword store? Why are you calculating bandwidth
based on COPY_SIZE?
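
If the batch only stores a single dword, the size used for the
bandwidth estimate should match what is actually transferred, e.g.
(untested sketch):

	/* a single dword store moves sizeof(uint32_t) bytes, not COPY_SIZE */
	gpu1_bw = (double)sizeof(uint32_t) / gpu1_latency / (1024 * 1024); /* MB/s */

Alternatively, make the GPU move COPY_SIZE bytes so that the bandwidth
figure actually means something.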

> +
> +        sync_addr = (void *)((char *)batch_addr + SZ_4K);
> +        sync.addr = to_user_pointer((uint64_t *)sync_addr);
> +        sync.timeline_value = EXEC_SYNC_VAL;
> +        *sync_addr = 0;
> +
> +        /* Execute STORE command on GPU0 */
> +        xe_exec_sync(gpu0->fd, exec_queue[0], batch_addr, &sync, 1);
> +        if (*sync_addr != EXEC_SYNC_VAL)
> +                xe_wait_ufence(gpu0->fd, (uint64_t *)sync_addr, EXEC_SYNC_VAL, exec_queue[0],
> +			       NSEC_PER_SEC * 10);
> +

Why isn't the GPU execution included in the timing?
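
If the intent is to time the actual GPU access, something along these
lines (just an untested sketch, reusing the helpers already in this
patch) would put the submit-and-wait path inside the measured window:

	clock_gettime(CLOCK_MONOTONIC, &t_start);
	/* submit the store batch and wait for the user fence */
	xe_exec_sync(gpu0->fd, exec_queue[0], batch_addr, &sync, 1);
	if (*sync_addr != EXEC_SYNC_VAL)
		xe_wait_ufence(gpu0->fd, (uint64_t *)sync_addr, EXEC_SYNC_VAL,
			       exec_queue[0], NSEC_PER_SEC * 10);
	clock_gettime(CLOCK_MONOTONIC, &t_end);
	gpu1_latency = time_diff(&t_start, &t_end);

Note that this still includes submission and fence-wait overhead, so it
is an upper bound on the access latency rather than a pure memory
latency.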


> +	memcpy(shared_val, (void *)addr, 4);

I think I asked about this a couple of times before, but why are you
doing a memcpy to "shared_val" here?
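
If the only purpose is to check the dword the GPU stored, a direct read
would do, e.g. (sketch):

	igt_assert_eq(*(uint32_t *)(uintptr_t)addr, value);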


> +	igt_assert_eq(shared_val[0], value);
> +
> +        /* CPU writes 10, memset set bytes no integer hence memset fills 4 bytes with 0x0A */
> +        memset((void *)(uintptr_t)addr, 10, sizeof(int));
> +        memcpy(shared_val, (void *)(uintptr_t)addr, sizeof(shared_val));
> +        igt_assert_eq(shared_val[0], 0x0A0A0A0A);


And here? Also, why this exercise of setting addr to 0x0a0a0a0a? Isn't
this a performance / latency test?

> +
> +	*(uint64_t *)addr = 50;
> +
> +	if(remote_copy) {
> +		igt_info("creating batch for COPY_CMD on GPU1\n");
> +		batch_init(gpu1->fd, vm[1], addr, to_user_pointer(copy_dst),
> +			   SZ_4K, &batch_bo, &batch_addr);
> +	} else {
> +		igt_info("creating batch for STORE_CMD on GPU1\n");
> +		store_dword_batch_init(gpu1->fd, vm[1], addr, &batch_bo, &batch_addr, value + 10);
> +	}
> +
> +	/* Measure GPU1 access latency/bandwidth */
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);

Here we also include the madvise and prefetch timing. Is that
intentional?

> +
> +        /* GPU1(dst_gpu) access */
> +	xe_multigpu_madvise(gpu1->fd, vm[1], addr, SZ_4K, 0,
> +			    DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC,
> +			    gpu1->fd, 0, gpu1->vram_regions[0], exec_queue[1],
> +			    0, 0);

Setting the madvise preferred location to gpu1 will migrate the data to
gpu1 at prefetch or fault time. Is that really what's intended?

> +
> +	setup_sync(&sync, &sync_addr, BIND_SYNC_VAL);
> +	xe_multigpu_prefetch(gpu1->fd, vm[1], addr, SZ_4K, &sync,
> +			     sync_addr, exec_queue[1], prefetch_req);
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_end);
> +	gpu2_latency = time_diff(&t_start, &t_end);
> +	gpu2_bw = COPY_SIZE / gpu2_latency / (1024 * 1024); // MB/s

Same comment as previously. Shouldn't we be timing the GPU execution?
Also, COPY_SIZE seems wrong here.

> +
> +        sync_addr = (void *)((char *)batch_addr + SZ_4K);
> +        sync.addr = to_user_pointer((uint64_t *)sync_addr);
> +        sync.timeline_value = EXEC_SYNC_VAL;
> +        *sync_addr = 0;
> +
> +        /* Execute COPY/STORE command on GPU1 */
> +        xe_exec_sync(gpu1->fd, exec_queue[1], batch_addr, &sync, 1);

Shouldn't this be included in the timing?

> +        if (*sync_addr != EXEC_SYNC_VAL)
> +                xe_wait_ufence(gpu1->fd, (uint64_t *)sync_addr, EXEC_SYNC_VAL, exec_queue[1],
> +			       NSEC_PER_SEC * 10);
> +
> +	if (!remote_copy)
> +		igt_assert_eq(*(uint64_t *)addr, value + 10);
> +	else
> +		igt_assert_eq(*(uint64_t *)copy_dst, 50);
> +
> +        /* CPU writes 11, memset set bytes no integer hence memset fills 4 bytes with 0x0B */
> +	/* Measure CPU access latency/bandwidth */
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);
> +        memset((void *)(uintptr_t)addr, 11, sizeof(int));
> +        memcpy(shared_val, (void *)(uintptr_t)addr, sizeof(shared_val));
> +	clock_gettime(CLOCK_MONOTONIC, &t_end);

I don't think it's meaningful to measure a single dword CPU write and
read in this way. In theory, the CPU accesses may actually never reach
physical memory. Suggest skipping the CPU timing.
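
If a CPU number is wanted at all, timing a copy of the whole buffer
would at least touch memory, e.g. (sketch; cpu_dst here is a
hypothetical destination buffer of at least SZ_4K bytes):

	clock_gettime(CLOCK_MONOTONIC, &t_start);
	/* copy the whole SVM buffer instead of a single dword */
	memcpy(cpu_dst, (void *)(uintptr_t)addr, SZ_4K);
	clock_gettime(CLOCK_MONOTONIC, &t_end);
	cpu_latency = time_diff(&t_start, &t_end);
	cpu_bw = SZ_4K / cpu_latency / (1024 * 1024); /* MB/s */

But even then the result is mostly a cache exercise at this size, so
dropping the CPU measurement is probably the cleaner option.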

> +	cpu_latency = time_diff(&t_start, &t_end);
> +	cpu_bw = COPY_SIZE / cpu_latency / (1024 * 1024); // MB/s
> +
> +        igt_assert_eq(shared_val[0], 0x0B0B0B0B);
> +
> +	/* Print results */
> +	igt_info("CPU: Latency %.6f s, Bandwidth %.2f MB/s\n", cpu_latency, cpu_bw);
> +	igt_info("GPU: Latency %.6f s, Bandwidth %.2f MB/s\n", gpu1_latency, gpu1_bw);
> +	igt_info("GPU: Latency %.6f s, Bandwidth %.2f MB/s\n", gpu2_latency, gpu2_bw);
> +
> +        munmap((void *)batch_addr, BATCH_SIZE(gpu0->fd));
> +        batch_fini(gpu0->fd, vm[0], batch_bo, batch_addr);
> +        batch_fini(gpu1->fd, vm[1], batch_bo, batch_addr);
> +        free(data);
> +        free(copy_dst);
> +
> +	cleanup_vm_and_queue(gpu0, vm[0], exec_queue[0]);
> +	cleanup_vm_and_queue(gpu1, vm[1], exec_queue[1]);
> +}
> +
>  static void
>  atomic_inc_op(struct xe_svm_gpu_info *gpu0,
>  	      struct xe_svm_gpu_info *gpu1,
> @@ -661,6 +819,19 @@ gpu_coherecy_test_wrapper(struct xe_svm_gpu_info *src,
>  	coherency_test_multigpu(src, dst, eci, args->op_mod, args->prefetch_req);
>  }
>  
> +static void
> +gpu_latency_test_wrapper(struct xe_svm_gpu_info *src,
> +			 struct xe_svm_gpu_info *dst,
> +			 struct drm_xe_engine_class_instance *eci,
> +			 void *extra_args)
> +{
> +	struct multigpu_ops_args *args = (struct multigpu_ops_args *)extra_args;
> +	igt_assert(src);
> +	igt_assert(dst);
> +
> +	latency_test_multigpu(src, dst, eci, args->op_mod, args->prefetch_req);
> +}
> +
>  igt_main
>  {
>  	struct xe_svm_gpu_info gpus[MAX_XE_GPUS];
> @@ -718,6 +889,16 @@ igt_main
>  		for_each_gpu_pair(gpu_cnt, gpus, &eci, gpu_coherecy_test_wrapper, &coh_args);
>  	}
>  
> +	igt_subtest("latency-multi-gpu") {
> +		struct multigpu_ops_args latency_args;
> +		latency_args.prefetch_req = 1;
> +		latency_args.op_mod = 1;
> +		for_each_gpu_pair(gpu_cnt, gpus, &eci, gpu_latency_test_wrapper, &latency_args);
> +		latency_args.prefetch_req = 0;
> +		latency_args.op_mod = 0;
> +		for_each_gpu_pair(gpu_cnt, gpus, &eci, gpu_latency_test_wrapper, &latency_args);
> +	}

Suggest using a flag variable similar to other xe igts, as discussed
previously.
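
I.e. something like the section/flags table pattern used in other xe
tests (untested sketch; the flag name is just for illustration):

	#define LATENCY_PREFETCH	(0x1 << 0)	/* hypothetical flag bit */

	const struct section {
		const char *name;
		unsigned int flags;
	} sections[] = {
		{ "prefetch", LATENCY_PREFETCH },
		{ "no-prefetch", 0 },
		{ NULL },
	};
	const struct section *s;

	for (s = sections; s->name; s++)
		igt_subtest_f("latency-multi-gpu-%s", s->name) {
			struct multigpu_ops_args latency_args = {
				.prefetch_req = !!(s->flags & LATENCY_PREFETCH),
				.op_mod = !!(s->flags & LATENCY_PREFETCH),
			};

			for_each_gpu_pair(gpu_cnt, gpus, &eci,
					  gpu_latency_test_wrapper,
					  &latency_args);
		}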

/Thomas


> +
>  	igt_fixture {
>  		int cnt;
>  

