All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: John Harrison <john.c.harrison@intel.com>
Cc: "José Roberto de Souza" <jose.souza@intel.com>,
	intel-xe@lists.freedesktop.org
Subject: Re: [PATCH] drm/xe: Nuke simple error capture
Date: Thu, 23 May 2024 13:38:49 -0400	[thread overview]
Message-ID: <Zk9_KenRCyXr2KII@intel.com> (raw)
In-Reply-To: <dccadb04-7aa1-428f-9d6b-4bc0f328ea3d@intel.com>

On Wed, May 22, 2024 at 05:35:51PM -0700, John Harrison wrote:
> On 5/22/2024 13:34, José Roberto de Souza wrote:
> > This error capture prints HW state into dmesg when a GPU hang happens.
> > It was useful when we did not have devcoredump; now it is an incomplete
> > version of devcoredump that has the potential to flood dmesg.
> > 
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: John Harrison <John.C.Harrison@Intel.com>
> > Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
> Reviewed-by: John Harrison <John.C.Harrison@Intel.com>

thank you both, pushed to drm-xe-next

> 
> > ---
> >   drivers/gpu/drm/xe/Kconfig.debug   | 10 ------
> >   drivers/gpu/drm/xe/xe_guc_submit.c | 53 +-----------------------------
> >   drivers/gpu/drm/xe/xe_vm.c         | 49 ---------------------------
> >   drivers/gpu/drm/xe/xe_vm.h         |  2 --
> >   4 files changed, 1 insertion(+), 113 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/Kconfig.debug b/drivers/gpu/drm/xe/Kconfig.debug
> > index df02e5d17d26f..bc177368af6c3 100644
> > --- a/drivers/gpu/drm/xe/Kconfig.debug
> > +++ b/drivers/gpu/drm/xe/Kconfig.debug
> > @@ -61,16 +61,6 @@ config DRM_XE_DEBUG_MEM
> >   	  If in doubt, say "N".
> > -config DRM_XE_SIMPLE_ERROR_CAPTURE
> > -	bool "Enable simple error capture to dmesg on job timeout"
> > -	default n
> > -	help
> > -	  Choose this option when debugging an unexpected job timeout
> > -
> > -	  Recommended for driver developers only.
> > -
> > -	  If in doubt, say "N".
> > -
> >   config DRM_XE_KUNIT_TEST
> >           tristate "KUnit tests for the drm xe driver" if !KUNIT_ALL_TESTS
> >   	depends on DRM_XE && KUNIT && DEBUG_FS
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 23f73577facf6..f0a5215159003 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -815,55 +815,6 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
> >   		       G2H_LEN_DW_DEREGISTER_CONTEXT, 2);
> >   }
> > -static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p);
> > -
> > -#if IS_ENABLED(CONFIG_DRM_XE_SIMPLE_ERROR_CAPTURE)
> > -static void simple_error_capture(struct xe_exec_queue *q)
> > -{
> > -	struct xe_guc *guc = exec_queue_to_guc(q);
> > -	struct xe_device *xe = guc_to_xe(guc);
> > -	struct drm_printer p = drm_err_printer(&xe->drm, NULL);
> > -	struct xe_hw_engine *hwe;
> > -	enum xe_hw_engine_id id;
> > -	u32 adj_logical_mask = q->logical_mask;
> > -	u32 width_mask = (0x1 << q->width) - 1;
> > -	int i;
> > -	bool cookie;
> > -
> > -	if (q->vm && !q->vm->error_capture.capture_once) {
> > -		q->vm->error_capture.capture_once = true;
> > -		cookie = dma_fence_begin_signalling();
> > -		for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
> > -			if (adj_logical_mask & BIT(i)) {
> > -				adj_logical_mask |= width_mask << i;
> > -				i += q->width;
> > -			} else {
> > -				++i;
> > -			}
> > -		}
> > -
> > -		if (xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL))
> > -			xe_gt_info(guc_to_gt(guc),
> > -				   "failed to get forcewake for error capture");
> > -		xe_guc_ct_print(&guc->ct, &p, true);
> > -		guc_exec_queue_print(q, &p);
> > -		for_each_hw_engine(hwe, guc_to_gt(guc), id) {
> > -			if (hwe->class != q->hwe->class ||
> > -			    !(BIT(hwe->logical_instance) & adj_logical_mask))
> > -				continue;
> > -			xe_hw_engine_print(hwe, &p);
> > -		}
> > -		xe_analyze_vm(&p, q->vm, q->gt->info.id);
> > -		xe_force_wake_put(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL);
> > -		dma_fence_end_signalling(cookie);
> > -	}
> > -}
> > -#else
> > -static void simple_error_capture(struct xe_exec_queue *q)
> > -{
> > -}
> > -#endif
> > -
> >   static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> >   {
> >   	struct xe_guc *guc = exec_queue_to_guc(q);
> > @@ -995,10 +946,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> >   	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
> >   		   "VM job timed out on non-killed execqueue\n");
> > -	if (!exec_queue_killed(q)) {
> > -		simple_error_capture(q);
> > +	if (!exec_queue_killed(q))
> >   		xe_devcoredump(job);
> > -	}
> >   	trace_xe_sched_job_timedout(job);
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index c5b1694b292fc..53f196cf447b4 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -3389,55 +3389,6 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
> >   	return 0;
> >   }
> > -int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
> > -{
> > -	struct drm_gpuva *gpuva;
> > -	bool is_vram;
> > -	uint64_t addr;
> > -
> > -	if (!down_read_trylock(&vm->lock)) {
> > -		drm_printf(p, " Failed to acquire VM lock to dump capture");
> > -		return 0;
> > -	}
> > -	if (vm->pt_root[gt_id]) {
> > -		addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
> > -		is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
> > -		drm_printf(p, " VM root: A:0x%llx %s\n", addr,
> > -			   is_vram ? "VRAM" : "SYS");
> > -	}
> > -
> > -	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
> > -		struct xe_vma *vma = gpuva_to_vma(gpuva);
> > -		bool is_userptr = xe_vma_is_userptr(vma);
> > -		bool is_null = xe_vma_is_null(vma);
> > -
> > -		if (is_null) {
> > -			addr = 0;
> > -		} else if (is_userptr) {
> > -			struct sg_table *sg = to_userptr_vma(vma)->userptr.sg;
> > -			struct xe_res_cursor cur;
> > -
> > -			if (sg) {
> > -				xe_res_first_sg(sg, 0, XE_PAGE_SIZE, &cur);
> > -				addr = xe_res_dma(&cur);
> > -			} else {
> > -				addr = 0;
> > -			}
> > -		} else {
> > -			addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
> > -			is_vram = xe_bo_is_vram(xe_vma_bo(vma));
> > -		}
> > -		drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
> > -			   xe_vma_start(vma), xe_vma_end(vma) - 1,
> > -			   xe_vma_size(vma),
> > -			   addr, is_null ? "NULL" : is_userptr ? "USR" :
> > -			   is_vram ? "VRAM" : "SYS");
> > -	}
> > -	up_read(&vm->lock);
> > -
> > -	return 0;
> > -}
> > -
> >   struct xe_vm_snapshot {
> >   	unsigned long num_snaps;
> >   	struct {
> > diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> > index 3ac9021f970e3..b481608b12f1b 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.h
> > +++ b/drivers/gpu/drm/xe/xe_vm.h
> > @@ -243,8 +243,6 @@ int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma);
> >   bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
> > -int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id);
> > -
> >   int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
> >   int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
> 

  reply	other threads:[~2024-05-23 17:39 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-22 20:34 [PATCH] drm/xe: Nuke simple error capture José Roberto de Souza
2024-05-22 21:17 ` ✓ CI.Patch_applied: success for " Patchwork
2024-05-22 21:18 ` ✓ CI.checkpatch: " Patchwork
2024-05-22 21:19 ` ✓ CI.KUnit: " Patchwork
2024-05-22 21:30 ` ✓ CI.Build: " Patchwork
2024-05-22 21:33 ` ✓ CI.Hooks: " Patchwork
2024-05-22 21:34 ` ✓ CI.checksparse: " Patchwork
2024-05-22 22:24 ` ✗ CI.BAT: failure " Patchwork
2024-05-23  0:35 ` [PATCH] " John Harrison
2024-05-23 17:38   ` Rodrigo Vivi [this message]
2024-05-23  0:52 ` ✗ CI.FULL: failure for " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zk9_KenRCyXr2KII@intel.com \
    --to=rodrigo.vivi@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=john.c.harrison@intel.com \
    --cc=jose.souza@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.