From: Matthew Auld <matthew.auld@intel.com>
To: Matthew Brost <matthew.brost@intel.com>, intel-xe@lists.freedesktop.org
Subject: Re: [PATCH 1/1] drm/xe: Store process name and pid in xe file
Date: Tue, 23 Jul 2024 11:07:04 +0100 [thread overview]
Message-ID: <d430a711-1e0c-4d49-81e8-bb34d7e831e5@intel.com> (raw)
In-Reply-To: <20240723042428.1701998-2-matthew.brost@intel.com>
On 23/07/2024 05:24, Matthew Brost wrote:
> An xe file can outlive the associated process as the GPU cleanup is just
> triggered upon file close (process kill) and completes sometime later.
> If the file close triggers error conditions (GPU hangs) the process
> cannot be safely referenced to retrieve the name and pid for debug
> information. Store the process name and pid directly in the xe file to
> be safe.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Also if you look at drm_file_update_pid(), things look pretty scary, so
this sounds very sensible to me.
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 10 ++--------
> drivers/gpu/drm/xe/xe_device.c | 9 +++++++++
> drivers/gpu/drm/xe/xe_device_types.h | 12 ++++++++++++
> drivers/gpu/drm/xe/xe_guc_submit.c | 10 ++--------
> 4 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 62c2b10fbf1d..d8d8ca2c19d3 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> u32 adj_logical_mask = q->logical_mask;
> u32 width_mask = (0x1 << q->width) - 1;
> const char *process_name = "no process";
> - struct task_struct *task = NULL;
>
> int i;
> bool cookie;
> @@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> ss->snapshot_time = ktime_get_real();
> ss->boot_time = ktime_get_boottime();
>
> - if (q->vm && q->vm->xef) {
> - task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> - if (task)
> - process_name = task->comm;
> - }
> + if (q->vm && q->vm->xef)
> + process_name = q->vm->xef->process_name;
> strscpy(ss->process_name, process_name);
> - if (task)
> - put_task_struct(task);
>
> ss->gt = q->gt;
> INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index b677608eb592..5a7b66703aa1 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> struct xe_drm_client *client;
> struct xe_file *xef;
> int ret = -ENOMEM;
> + struct task_struct *task = NULL;
>
> xef = kzalloc(sizeof(*xef), GFP_KERNEL);
> if (!xef)
> @@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
> file->driver_priv = xef;
> kref_init(&xef->refcount);
>
> + task = get_pid_task(file->pid, PIDTYPE_PID);
We should probably access file->pid with rcu_access_pointer() here. In
practice it shouldn't really matter, but the pointer is annotated
with __rcu, so we should respect that.
Otherwise,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> + if (task) {
> + xef->process_name = kstrdup(task->comm, GFP_KERNEL);
> + xef->pid = task->pid;
> + put_task_struct(task);
> + }
> +
> return 0;
> }
>
> @@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
> spin_unlock(&xe->clients.lock);
>
> xe_drm_client_put(xef->client);
> + kfree(xef->process_name);
> kfree(xef);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 36252d5b1663..5b7292a9a66d 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -582,6 +582,18 @@ struct xe_file {
> /** @client: drm client */
> struct xe_drm_client *client;
>
> + /**
> + * @process_name: process name for file handle, used to safely output
> + * during error situations where xe file can outlive process
> + */
> + char *process_name;
> +
> + /**
> + * @pid: pid for file handle, used to safely output during error
> + * situations where xe file can outlive process
> + */
> + pid_t pid;
> +
> /** @refcount: ref count of this xe file */
> struct kref refcount;
> };
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index da2ead86b9ae..a4570631926f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> struct xe_gpu_scheduler *sched = &q->guc->sched;
> struct xe_guc *guc = exec_queue_to_guc(q);
> const char *process_name = "no process";
> - struct task_struct *task = NULL;
> int err = -ETIME;
> pid_t pid = -1;
> int i = 0;
> @@ -1172,17 +1171,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> }
>
> if (q->vm && q->vm->xef) {
> - task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
> - if (task) {
> - process_name = task->comm;
> - pid = task->pid;
> - }
> + process_name = q->vm->xef->process_name;
> + pid = q->vm->xef->pid;
> }
> xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
> xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> q->guc->id, q->flags, process_name, pid);
> - if (task)
> - put_task_struct(task);
>
> trace_xe_sched_job_timedout(job);
>
next prev parent reply other threads:[~2024-07-23 10:07 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-23 4:24 [PATCH 0/1] Store process name and pid in xe file Matthew Brost
2024-07-23 4:24 ` [PATCH 1/1] drm/xe: " Matthew Brost
2024-07-23 10:07 ` Matthew Auld [this message]
2024-07-23 4:28 ` ✓ CI.Patch_applied: success for " Patchwork
2024-07-23 4:28 ` ✓ CI.checkpatch: " Patchwork
2024-07-23 4:29 ` ✓ CI.KUnit: " Patchwork
2024-07-23 4:42 ` ✓ CI.Build: " Patchwork
2024-07-23 4:44 ` ✓ CI.Hooks: " Patchwork
2024-07-23 4:47 ` ✓ CI.checksparse: " Patchwork
2024-07-23 5:12 ` ✓ CI.BAT: " Patchwork
2024-07-23 6:10 ` ✗ CI.FULL: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d430a711-1e0c-4d49-81e8-bb34d7e831e5@intel.com \
--to=matthew.auld@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.brost@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox