Igt-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Manszewski, Christoph" <christoph.manszewski@intel.com>
To: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>,
	igt-dev@lists.freedesktop.org
Cc: jonathan.cavitt@intel.com, mika.kuoppala@intel.com,
	dominik.grzegorzek@intel.com
Subject: Re: [PATCH i-g-t 4/4] tests/intel/xe_eudebug_online: Add read/write pagefault online tests
Date: Tue, 19 Nov 2024 17:49:18 +0100	[thread overview]
Message-ID: <7b6ec309-ed43-4b34-ba5c-694a9c8a715c@intel.com> (raw)
In-Reply-To: <20241115141132.866838-5-gwan-gyeong.mun@intel.com>

Hi Gwan-gyeong,

On 15.11.2024 15:11, Gwan-gyeong Mun wrote:
> Add read and write pagefault tests to xe_eudebug_online that checks if a
> pagefault event is submitted by the KMD debugger when a pagefault occurs.

For some reason when running the test with the '--debug' option it seems 
like there are events missing in the debugger log. I haven't been able 
to spot whether that's a problem on the kmd or igt side, but that seems 
only to be the case for the page fault tests.

Yes, the test passing despite this... is not good. That inevitable igt 
event processing rewrite is knocking on the door. But the debugger log 
itself should report all events the debugger has sent - so that needs 
some attention.

> 
> Test that read (load instruction) and write(store instruction) attempt to
> load or store access to unallocated memory, causing a pagefault.
> Examine the address causing the page fault and the number of eu threads
> causing the pagefault.
> 
> Co-developed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> Signed-off-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
> ---
>   tests/intel/xe_eudebug_online.c | 219 +++++++++++++++++++++++++++++++-
>   1 file changed, 215 insertions(+), 4 deletions(-)
> 
> diff --git a/tests/intel/xe_eudebug_online.c b/tests/intel/xe_eudebug_online.c
> index 0ef0d8093..eae0eb520 100644
> --- a/tests/intel/xe_eudebug_online.c
> +++ b/tests/intel/xe_eudebug_online.c
> @@ -36,6 +36,8 @@
>   #define BB_IN_VRAM			(1 << 11)
>   #define TARGET_IN_SRAM			(1 << 12)
>   #define TARGET_IN_VRAM			(1 << 13)
> +#define SHADER_PAGEFAULT_READ		(1 << 14)
> +#define SHADER_PAGEFAULT_WRITE		(1 << 15)
>   #define TRIGGER_UFENCE_SET_BREAKPOINT	(1 << 24)
>   #define TRIGGER_RESUME_SINGLE_WALK	(1 << 25)
>   #define TRIGGER_RESUME_PARALLEL_WALK	(1 << 26)
> @@ -45,6 +47,7 @@
>   #define TRIGGER_RESUME_DSS		(1 << 30)
>   #define TRIGGER_RESUME_ONE		(1 << 31)
>   
> +#define SHADER_PAGEFAULT	(SHADER_PAGEFAULT_READ | SHADER_PAGEFAULT_WRITE)
>   #define BB_REGION_BITMASK	(BB_IN_SRAM | BB_IN_VRAM)
>   #define TARGET_REGION_BITMASK	(TARGET_IN_SRAM | TARGET_IN_VRAM)
>   
> @@ -61,6 +64,8 @@
>   #define CACHING_VALUE(n)	(CACHING_INIT_VALUE + (n))
>   
>   #define SHADER_CANARY 0x01010101
> +#define BAD_CANARY 0xf1f1f1f
> +#define BAD_OFFSET (0x12345678ull << 12)
>   
>   #define WALKER_X_DIM		4
>   #define WALKER_ALIGNMENT	16
> @@ -123,6 +128,9 @@ static int get_number_of_threads(uint64_t flags)
>   	if (flags & SHADER_MIN_THREADS)
>   		return 16;
>   
> +	if (flags & SHADER_PAGEFAULT)
> +		return 16;

Nit: could be merged together with the above.

> +
>   	if (flags & (TRIGGER_RESUME_ONE | TRIGGER_RESUME_SINGLE_WALK |
>   		     TRIGGER_RESUME_PARALLEL_WALK | SHADER_CACHING_SRAM | SHADER_CACHING_VRAM))
>   		return 32;
> @@ -179,6 +187,16 @@ static struct gpgpu_shader *get_shader(int fd, const unsigned int flags)
>   			gpgpu_shader__common_target_write_u32(shader, s_dim.y + i, CACHING_VALUE(i));
>   		gpgpu_shader__nop(shader);
>   		gpgpu_shader__breakpoint(shader);
> +	} else if (flags & SHADER_PAGEFAULT) {
> +		if (flags & SHADER_PAGEFAULT_READ)
> +			gpgpu_shader__read_page_fault(shader, BAD_OFFSET);
> +		else if (flags & SHADER_PAGEFAULT_WRITE)
> +			gpgpu_shader__write_offset(shader, BAD_OFFSET, BAD_CANARY);
> +
> +		gpgpu_shader__label(shader, 0);
> +		gpgpu_shader__write_dword(shader, SHADER_CANARY, 0);
> +		gpgpu_shader__jump_neq(shader, 0, w_dim.y, STEERING_END_LOOP);
> +		gpgpu_shader__write_dword(shader, SHADER_CANARY, 0);
>   	}
>   
>   	gpgpu_shader__eot(shader);
> @@ -217,6 +235,17 @@ static int count_set_bits(void *ptr, size_t size)
>   	return count;
>   }
>   
> +static int
> +eu_attentions_xor_count(const uint32_t *a, const uint32_t *b, uint32_t size)

Nit: the current checkpatch line limit is 100 characters which means 
this would fit in a single line.

> +{
> +	int count = 0;
> +
> +	for (int i = 0; i < size / 4 ; i++)
> +		count += igt_hweight(a[i] ^ b[i]);
> +
> +	return count;
> +}
> +
>   static int count_canaries_eq(uint32_t *ptr, struct dim_t w_dim, uint32_t value)
>   {
>   	int count = 0;
> @@ -636,7 +665,7 @@ static void eu_attention_resume_trigger(struct xe_eudebug_debugger *d,
>   		}
>   	}
>   
> -	if (d->flags & SHADER_LOOP) {
> +	if (d->flags & (SHADER_LOOP | SHADER_PAGEFAULT)) {
>   		uint32_t threads = get_number_of_threads(d->flags);
>   		uint32_t val = STEERING_END_LOOP;
>   
> @@ -746,6 +775,43 @@ static void eu_attention_resume_single_step_trigger(struct xe_eudebug_debugger *
>   			data->single_step_bitmask[i] &= ~att->bitmask[i];
>   }
>   
> +static void eu_attention_resume_pagefault_trigger(struct xe_eudebug_debugger *d,
> +						  struct drm_xe_eudebug_event *e)
> +{
> +	struct drm_xe_eudebug_event_eu_attention *att = (void *) e;
> +	struct online_debug_data *data = d->ptr;
> +	uint32_t bitmask_size = att->bitmask_size;
> +	uint8_t *bitmask;
> +
> +	if (data->last_eu_control_seqno > att->base.seqno)
> +		return;
> +
> +	bitmask = calloc(1, att->bitmask_size);
> +
> +	eu_ctl_stopped(d->fd, att->client_handle, att->exec_queue_handle,
> +		       att->lrc_handle, bitmask, &bitmask_size);
> +	igt_assert(bitmask_size == att->bitmask_size);
> +
> +	pthread_mutex_lock(&data->mutex);
> +
> +	if (d->flags & SHADER_PAGEFAULT) {
> +		uint32_t threads = get_number_of_threads(d->flags);
> +		uint32_t val = STEERING_END_LOOP;
> +
> +		igt_assert_eq(pwrite(data->vm_fd, &val, sizeof(uint32_t),
> +				     data->target_offset + steering_offset(threads)),
> +			      sizeof(uint32_t));
> +		fsync(data->vm_fd);
> +	}
> +	pthread_mutex_unlock(&data->mutex);
> +
> +	data->last_eu_control_seqno = eu_ctl_resume(d->master_fd, d->fd, att->client_handle,
> +						    att->exec_queue_handle, att->lrc_handle,
> +						    bitmask, att->bitmask_size);
> +
> +	free(bitmask);
> +}
> +
>   static void open_trigger(struct xe_eudebug_debugger *d,
>   			 struct drm_xe_eudebug_event *e)
>   {
> @@ -1015,7 +1081,7 @@ static void run_online_client(struct xe_eudebug_client *c)
>   	struct intel_bb *ibb;
>   	struct intel_buf *buf;
>   	uint32_t *ptr;
> -	int fd;
> +	int fd, vm_flags;
>   
>   	metadata[0] = calloc(2, sizeof(*metadata));
>   	metadata[1] = calloc(2, sizeof(*metadata));
> @@ -1025,7 +1091,7 @@ static void run_online_client(struct xe_eudebug_client *c)
>   	fd = xe_eudebug_client_open_driver(c);
>   
>   	/* Additional memory for steering control */
> -	if (c->flags & SHADER_LOOP || c->flags & SHADER_SINGLE_STEP)
> +	if (c->flags & SHADER_LOOP || c->flags & SHADER_SINGLE_STEP || c->flags & SHADER_PAGEFAULT)
>   		s_dim.y++;
>   	/* Additional memory for caching check */
>   	if ((c->flags & SHADER_CACHING_SRAM) || (c->flags & SHADER_CACHING_VRAM))
> @@ -1045,7 +1111,11 @@ static void run_online_client(struct xe_eudebug_client *c)
>   							   DRM_XE_DEBUG_METADATA_PROGRAM_MODULE,
>   							   2 * sizeof(*metadata), metadata[1]);
>   
> -	create.vm_id = xe_eudebug_client_vm_create(c, fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
> +	vm_flags = DRM_XE_VM_CREATE_FLAG_LR_MODE;
> +	vm_flags |= c->flags & SHADER_PAGEFAULT ? DRM_XE_VM_CREATE_FLAG_FAULT_MODE : 0;
> +
> +	create.vm_id = xe_eudebug_client_vm_create(c, fd, vm_flags, 0);
> +
>   	xe_eudebug_client_exec_queue_create(c, fd, &create);
>   
>   	ibb = xe_bb_create_on_offset(fd, create.exec_queue_id, create.vm_id, bb_offset, bb_size,
> @@ -1245,11 +1315,14 @@ match_attention_with_exec_queue(struct xe_eudebug_event_log *log,
>   static void online_session_check(struct xe_eudebug_session *s, int flags)
>   {
>   	struct drm_xe_eudebug_event_eu_attention *ea = NULL;
> +	struct drm_xe_eudebug_event_pagefault *pf = NULL;
>   	struct drm_xe_eudebug_event *event = NULL;
>   	struct online_debug_data *data = s->client->ptr;
>   	bool expect_exception = flags & DISABLE_DEBUG_MODE ? false : true;
>   	int sum = 0;
>   	int bitmask_size;
> +	int pagefault_threads = 0;
> +	uint32_t *ptr = NULL;
>   
>   	xe_eudebug_session_check(s, true, XE_EUDEBUG_FILTER_EVENT_VM_BIND |
>   					  XE_EUDEBUG_FILTER_EVENT_VM_BIND_OP |
> @@ -1265,6 +1338,16 @@ static void online_session_check(struct xe_eudebug_session *s, int flags)
>   			igt_assert_eq(ea->bitmask_size, bitmask_size);
>   			sum += count_set_bits(ea->bitmask, bitmask_size);
>   			igt_assert(match_attention_with_exec_queue(s->debugger->log, ea));
> +		} else if (event->type == DRM_XE_EUDEBUG_EVENT_PAGEFAULT) {
> +			uint32_t after_offset = bitmask_size / sizeof(uint32_t);
> +			uint32_t resolved_offset = bitmask_size / sizeof(uint32_t) * 2;
> +
> +			pf = (struct drm_xe_eudebug_event_pagefault *)event;
> +			ptr = (uint32_t *) pf->bitmask;
> +			igt_assert_eq(pf->bitmask_size, bitmask_size * 3);
> +			pagefault_threads += eu_attentions_xor_count(ptr + after_offset,
> +								     ptr + resolved_offset,
> +								     bitmask_size);
>   		}
>   	}
>   
> @@ -1279,6 +1362,9 @@ static void online_session_check(struct xe_eudebug_session *s, int flags)
>   		igt_assert(sum > 0);
>   	else
>   		igt_assert(sum == 0);
> +
> +	if (flags & SHADER_PAGEFAULT)
> +		igt_assert(pagefault_threads > 0);
>   }
>   
>   static void ufence_ack_trigger(struct xe_eudebug_debugger *d,
> @@ -1302,6 +1388,55 @@ static void ufence_ack_set_bp_trigger(struct xe_eudebug_debugger *d,
>   	}
>   }
>   
> +static void pagefault_trigger(struct xe_eudebug_debugger *d,
> +			      struct drm_xe_eudebug_event *e)
> +{
> +	struct drm_xe_eudebug_event_pagefault *pf = (void *) e;
> +	int before_threads, after_threads, resolved_threads, pagefault_threads;
> +	uint32_t attn_size = pf->bitmask_size / 3;
> +	uint32_t *ptr = (uint32_t *) pf->bitmask;
> +	uint32_t offset, before_offset = 0;
> +	uint32_t after_offset = attn_size / sizeof(uint32_t);
> +	uint32_t resolved_offset = attn_size / sizeof(uint32_t) * 2;
> +
> +	before_threads = count_set_bits(ptr + before_offset, attn_size);
> +	after_threads = count_set_bits(ptr + after_offset, attn_size);
> +	resolved_threads = count_set_bits(ptr + resolved_offset, attn_size);
> +
> +	pagefault_threads = eu_attentions_xor_count(ptr + after_offset,
> +						    ptr + resolved_offset,
> +						    attn_size);
> +
> +	igt_debug("EVENT[%llu] pagefault; threads[before=%d, after=%d, "
> +		  "resolved=%d, pagefault=%d] "
> +		  "client[%llu], exec_queue[%llu], lrc[%llu], bitmask_size[%d], "
> +		  "pagefault_address[0x%llx]\n",
> +		  pf->base.seqno, before_threads, after_threads, resolved_threads,
> +		  pagefault_threads, pf->client_handle, pf->exec_queue_handle,
> +		  pf->lrc_handle, pf->bitmask_size,
> +		  pf->pagefault_address);
> +
> +	for (int idx = 0; idx < 3; idx++) {
> +		if (idx == 0) {
> +			igt_debug("=== Attentions before ===\n");
> +			offset = before_offset;
> +		} else if (idx == 1) {
> +			igt_debug("=== Attentions after ===\n");
> +			offset = after_offset;
> +		} else {
> +			igt_debug("=== Attentions resolved ===\n");
> +			offset = resolved_offset;
> +		}
> +
> +		for (uint32_t i = 0; i < attn_size / sizeof(uint32_t); i += 2)
> +			igt_debug("bitmask[%d] = 0x%08x%08x\n", i / 2,
> +				  ptr[offset + i], ptr[offset + i + 1]);
> +	}
> +
> +	igt_assert(pagefault_threads > 0);
> +	igt_assert_eq_u64(pf->pagefault_address, BAD_OFFSET);
> +}
> +
>   /**
>    * SUBTEST: basic-breakpoint
>    * Description:
> @@ -1383,6 +1518,77 @@ static void test_set_breakpoint_online(int fd, struct drm_xe_engine_class_instan
>   	online_debug_data_destroy(data);
>   }
>   
> +/**
> + * SUBTEST: pagefault-read
> + * Description:
> + *     Check whether KMD sends pagefault event for workload in debug mode that
> + *     triggers a read pagefault.
> + *
> + * SUBTEST: pagefault-write
> + * Description:
> + *     Check whether KMD sends pagefault event for workload in debug mode that
> + *     triggers a write pagefault.
> + */
> +static void test_pagefault_online(int fd, struct drm_xe_engine_class_instance *hwe,
> +				  int flags)
> +{
> +	struct xe_eudebug_session *s;
> +	struct online_debug_data *data;
> +	uint32_t val;
> +
> +	data = online_debug_data_create(hwe);
> +	s = xe_eudebug_session_create(fd, run_online_client, flags, data);
> +
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_OPEN,
> +					open_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_EXEC_QUEUE,
> +					exec_queue_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_EU_ATTENTION,
> +					eu_attention_debug_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_EU_ATTENTION,
> +					eu_attention_resume_pagefault_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_VM, vm_open_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_METADATA,
> +					create_metadata_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_VM_BIND_UFENCE,
> +					ufence_ack_trigger);
> +	xe_eudebug_debugger_add_trigger(s->debugger, DRM_XE_EUDEBUG_EVENT_PAGEFAULT,
> +					pagefault_trigger);
> +
> +	igt_assert_eq(xe_eudebug_debugger_attach(s->debugger, s->client), 0);
> +	xe_eudebug_debugger_start_worker(s->debugger);
> +	xe_eudebug_client_start(s->client);
> +
> +	/* wait for workload to start */
> +	igt_for_milliseconds(STARTUP_TIMEOUT_MS) {
> +		/* collect needed data from triggers */
> +		if (READ_ONCE(data->vm_fd) == -1 || READ_ONCE(data->target_size) == 0)
> +			continue;
> +
> +		if (pread(data->vm_fd, &val, sizeof(val), data->target_offset) == sizeof(val))
> +			if (val != 0)
> +				break;
> +	}

I think this is redundant. It looks like it copies the 
'test_interrupt_all' function, but as far as I'm concerned the waiting loop 
is there to interact with the client thread while the gpu workload is 
running. Since we don't do that here I think 'test_pagefault_online' 
should rather resemble the 'test_basic_online' function.

Thanks,
Christoph

> +
> +	pthread_mutex_lock(&data->mutex);
> +	igt_assert(data->client_handle != -1);
> +	igt_assert(data->exec_queue_handle != -1);
> +
> +	pthread_mutex_unlock(&data->mutex);
> +
> +	xe_eudebug_client_wait_done(s->client);
> +
> +	xe_eudebug_debugger_stop_worker(s->debugger, 1);
> +
> +	xe_eudebug_event_log_print(s->debugger->log, true);
> +	xe_eudebug_event_log_print(s->client->log, true);
> +
> +	online_session_check(s, s->flags);
> +
> +	xe_eudebug_session_destroy(s);
> +	online_debug_data_destroy(data);
> +}
> +
>   /**
>    * SUBTEST: preempt-breakpoint
>    * Description:
> @@ -2344,6 +2550,11 @@ igt_main
>   	igt_subtest("breakpoint-many-sessions-tiles")
>   		test_many_sessions_on_tiles(fd, true);
>   
> +	test_gt_render_or_compute("pagefault-read", fd, hwe)
> +		test_pagefault_online(fd, hwe, SHADER_PAGEFAULT_READ);
> +	test_gt_render_or_compute("pagefault-write", fd, hwe)
> +		test_pagefault_online(fd, hwe, SHADER_PAGEFAULT_WRITE);
> +
>   	igt_fixture {
>   		xe_eudebug_enable(fd, was_enabled);
>   

  parent reply	other threads:[~2024-11-19 16:49 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-11-15 14:11 [PATCH i-g-t 0/4] tests/intel/xe_eudebug_online: Introduce read/write pagefault tests Gwan-gyeong Mun
2024-11-15 14:11 ` [PATCH i-g-t 1/4] lib/gppgu_shader: Add write to ppgtt offset Gwan-gyeong Mun
2024-11-18 13:00   ` Hajda, Andrzej
2024-11-21 12:01     ` Gwan-gyeong Mun
2024-11-15 14:11 ` [PATCH i-g-t 2/4] lib/gpgpu_shader: Add causing a read pagefault from the eu thread Gwan-gyeong Mun
2024-11-18 13:08   ` Hajda, Andrzej
2024-11-21 12:02     ` Gwan-gyeong Mun
2024-11-19 11:38   ` Manszewski, Christoph
2024-11-21 12:07     ` Gwan-gyeong Mun
2024-11-15 14:11 ` [PATCH i-g-t 3/4] eudebug: Add eudebug pagefault event declarations Gwan-gyeong Mun
2024-11-18 16:52   ` Hajda, Andrzej
2024-11-19  8:50     ` Manszewski, Christoph
2024-11-19 12:26   ` Manszewski, Christoph
2024-11-15 14:11 ` [PATCH i-g-t 4/4] tests/intel/xe_eudebug_online: Add read/write pagefault online tests Gwan-gyeong Mun
2024-11-19  8:10   ` Hajda, Andrzej
2024-11-21 12:06     ` Gwan-gyeong Mun
2024-11-19 15:58   ` Hajda, Andrzej
2024-11-21 12:11     ` Gwan-gyeong Mun
2024-11-19 16:49   ` Manszewski, Christoph [this message]
2024-11-21 12:15     ` Gwan-gyeong Mun
2024-11-15 14:42 ` ✓ CI.xeBAT: success for tests/intel/xe_eudebug_online: Introduce read/write pagefault tests Patchwork
2024-11-15 14:48 ` ✓ Fi.CI.BAT: " Patchwork
2024-11-15 21:24 ` ✗ Fi.CI.IGT: failure " Patchwork
2024-11-15 23:25 ` ✗ CI.xeFULL: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7b6ec309-ed43-4b34-ba5c-694a9c8a715c@intel.com \
    --to=christoph.manszewski@intel.com \
    --cc=dominik.grzegorzek@intel.com \
    --cc=gwan-gyeong.mun@intel.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=jonathan.cavitt@intel.com \
    --cc=mika.kuoppala@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox