Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Matthew Auld <matthew.auld@intel.com>
To: Matthew Brost <matthew.brost@intel.com>, intel-xe@lists.freedesktop.org
Subject: Re: [PATCH 1/1] drm/xe: Store process name and pid in xe file
Date: Tue, 23 Jul 2024 11:07:04 +0100	[thread overview]
Message-ID: <d430a711-1e0c-4d49-81e8-bb34d7e831e5@intel.com> (raw)
In-Reply-To: <20240723042428.1701998-2-matthew.brost@intel.com>

On 23/07/2024 05:24, Matthew Brost wrote:
> An xe file can outlive the associated process as the GPU cleanup is just
> triggered upon file close (process kill) and completes sometime later.
> If the file close triggers error conditions (GPU hangs) the process
> cannot be safely referenced to retrieve the name and pid for debug
> information. Store the process name and pid directly in the xe file to
> be safe.
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

Also if you look at drm_file_update_pid(), things look pretty scary, so 
this sounds very sensible to me.

> ---
>   drivers/gpu/drm/xe/xe_devcoredump.c  | 10 ++--------
>   drivers/gpu/drm/xe/xe_device.c       |  9 +++++++++
>   drivers/gpu/drm/xe/xe_device_types.h | 12 ++++++++++++
>   drivers/gpu/drm/xe/xe_guc_submit.c   | 10 ++--------
>   4 files changed, 25 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 62c2b10fbf1d..d8d8ca2c19d3 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   	u32 adj_logical_mask = q->logical_mask;
>   	u32 width_mask = (0x1 << q->width) - 1;
>   	const char *process_name = "no process";
> -	struct task_struct *task = NULL;
>   
>   	int i;
>   	bool cookie;
> @@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   	ss->snapshot_time = ktime_get_real();
>   	ss->boot_time = ktime_get_boottime();
>   
> -	if (q->vm && q->vm->xef) {
> -		task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> -		if (task)
> -			process_name = task->comm;
> -	}
> +	if (q->vm && q->vm->xef)
> +		process_name = q->vm->xef->process_name;
>   	strscpy(ss->process_name, process_name);
> -	if (task)
> -		put_task_struct(task);
>   
>   	ss->gt = q->gt;
>   	INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index b677608eb592..5a7b66703aa1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>   	struct xe_drm_client *client;
>   	struct xe_file *xef;
>   	int ret = -ENOMEM;
> +	struct task_struct *task = NULL;
>   
>   	xef = kzalloc(sizeof(*xef), GFP_KERNEL);
>   	if (!xef)
> @@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>   	file->driver_priv = xef;
>   	kref_init(&xef->refcount);
>   
> +	task = get_pid_task(file->pid, PIDTYPE_PID);

We should probably access file->pid with rcu_access_pointer() here. In
practice it shouldn't really matter, but the pointer is annotated
with __rcu so we should respect that.

Otherwise,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>

> +	if (task) {
> +		xef->process_name = kstrdup(task->comm, GFP_KERNEL);
> +		xef->pid = task->pid;
> +		put_task_struct(task);
> +	}
> +
>   	return 0;
>   }
>   
> @@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
>   	spin_unlock(&xe->clients.lock);
>   
>   	xe_drm_client_put(xef->client);
> +	kfree(xef->process_name);
>   	kfree(xef);
>   }
>   
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 36252d5b1663..5b7292a9a66d 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -582,6 +582,18 @@ struct xe_file {
>   	/** @client: drm client */
>   	struct xe_drm_client *client;
>   
> +	/**
> +	 * @process_name: process name for file handle, used to safely output
> +	 * during error situations where xe file can outlive process
> +	 */
> +	char *process_name;
> +
> +	/**
> +	 * @pid: pid for file handle, used to safely output during error
> +	 * situations where xe file can outlive process
> +	 */
> +	pid_t pid;
> +
>   	/** @refcount: ref count of this xe file */
>   	struct kref refcount;
>   };
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index da2ead86b9ae..a4570631926f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	struct xe_gpu_scheduler *sched = &q->guc->sched;
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   	const char *process_name = "no process";
> -	struct task_struct *task = NULL;
>   	int err = -ETIME;
>   	pid_t pid = -1;
>   	int i = 0;
> @@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	}
>   
>   	if (q->vm && q->vm->xef) {
> -		task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> -		if (task) {
> -			process_name = task->comm;
> -			pid = task->pid;
> -		}
> +		process_name = q->vm->xef->process_name;
> +		pid = q->vm->xef->pid;
>   	}
>   	xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
>   		     xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>   		     q->guc->id, q->flags, process_name, pid);
> -	if (task)
> -		put_task_struct(task);
>   
>   	trace_xe_sched_job_timedout(job);
>   

  reply	other threads:[~2024-07-23 10:07 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-07-23  4:24 [PATCH 0/1] Store process name and pid in xe file Matthew Brost
2024-07-23  4:24 ` [PATCH 1/1] drm/xe: " Matthew Brost
2024-07-23 10:07   ` Matthew Auld [this message]
2024-07-23  4:28 ` ✓ CI.Patch_applied: success for " Patchwork
2024-07-23  4:28 ` ✓ CI.checkpatch: " Patchwork
2024-07-23  4:29 ` ✓ CI.KUnit: " Patchwork
2024-07-23  4:42 ` ✓ CI.Build: " Patchwork
2024-07-23  4:44 ` ✓ CI.Hooks: " Patchwork
2024-07-23  4:47 ` ✓ CI.checksparse: " Patchwork
2024-07-23  5:12 ` ✓ CI.BAT: " Patchwork
2024-07-23  6:10 ` ✗ CI.FULL: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d430a711-1e0c-4d49-81e8-bb34d7e831e5@intel.com \
    --to=matthew.auld@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox