* [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string
@ 2024-11-22 2:53 John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 1/2] drm/xe: Add a reason string to the devcoredump John.C.Harrison
` (4 more replies)
0 siblings, 5 replies; 7+ messages in thread
From: John.C.Harrison @ 2024-11-22 2:53 UTC (permalink / raw)
To: Intel-Xe; +Cc: John Harrison
From: John Harrison <John.C.Harrison@Intel.com>
There are now many ways to trigger a devcoredump capture. So include a
reason message to say why it happened.
Also, add locking to avoid races between double captures and between
freeing a capture and reading it.
v2: Change the spin_lock to a mutex and a few other tweaks (review
feedback from Matthew Brost).
v3: Move mutex locks and add reclaim annotation (review feedback from
Matthew Brost).
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
John Harrison (2):
drm/xe: Add a reason string to the devcoredump
drm/xe: Add mutex locking to devcoredump
drivers/gpu/drm/xe/xe_devcoredump.c | 53 ++++++++++++++++++++---
drivers/gpu/drm/xe/xe_devcoredump.h | 5 ++-
drivers/gpu/drm/xe/xe_devcoredump_types.h | 8 +++-
drivers/gpu/drm/xe/xe_guc_submit.c | 14 ++++--
4 files changed, 68 insertions(+), 12 deletions(-)
--
2.47.0
* [PATCH v3 1/2] drm/xe: Add a reason string to the devcoredump
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
@ 2024-11-22 2:53 ` John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump John.C.Harrison
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: John.C.Harrison @ 2024-11-22 2:53 UTC (permalink / raw)
To: Intel-Xe; +Cc: John Harrison, Matthew Brost
From: John Harrison <John.C.Harrison@Intel.com>
There are debug-level prints giving more information about the cause
of the hang immediately before core dumps are created. However, not
everyone has debug-level prints enabled or saves the dmesg log at
all. So include that information in the dump file itself. Also, at
least one of those prints included the pid as well as the process
name, so include that in the capture too.
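For reference, the new call shape, condensed from the xe_devcoredump.h
and xe_guc_submit.c hunks below:

        void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job,
                            const char *fmt, ...);

        /* e.g. from the timed-out job handler */
        xe_devcoredump(q, job,
                       "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
                       xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
                       q->guc->id, q->flags);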
v2: Fix kvfree vs kfree and missing kernel-doc (review feedback from
Matthew Brost)
v3: Use GFP_ATOMIC instead of GFP_KERNEL.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 21 ++++++++++++++++++---
drivers/gpu/drm/xe/xe_devcoredump.h | 5 +++--
drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++++
drivers/gpu/drm/xe/xe_guc_submit.c | 14 ++++++++++----
4 files changed, 35 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 0e5edf14a241..f4c77f525819 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -99,6 +99,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
p = drm_coredump_printer(&iter);
drm_puts(&p, "**** Xe Device Coredump ****\n");
+ drm_printf(&p, "Reason: %s\n", ss->reason);
drm_puts(&p, "kernel: " UTS_RELEASE "\n");
drm_puts(&p, "module: " KBUILD_MODNAME "\n");
@@ -106,7 +107,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
ts = ktime_to_timespec64(ss->boot_time);
drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
- drm_printf(&p, "Process: %s\n", ss->process_name);
+ drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid);
xe_device_snapshot_print(xe, &p);
drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id);
@@ -138,6 +139,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
{
int i;
+ kfree(ss->reason);
+ ss->reason = NULL;
+
xe_guc_log_snapshot_free(ss->guc.log);
ss->guc.log = NULL;
@@ -253,8 +257,11 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
ss->snapshot_time = ktime_get_real();
ss->boot_time = ktime_get_boottime();
- if (q->vm && q->vm->xef)
+ if (q->vm && q->vm->xef) {
process_name = q->vm->xef->process_name;
+ ss->pid = q->vm->xef->pid;
+ }
+
strscpy(ss->process_name, process_name);
ss->gt = q->gt;
@@ -292,15 +299,18 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
* xe_devcoredump - Take the required snapshots and initialize coredump device.
* @q: The faulty xe_exec_queue, where the issue was detected.
* @job: The faulty xe_sched_job, where the issue was detected.
+ * @fmt: Printf format + args to describe the reason for the core dump
*
* This function should be called at the crash time within the serialized
* gt_reset. It is skipped if we still have the core dump device available
* with the information of the 'first' snapshot.
*/
-void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
+__printf(3, 4)
+void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...)
{
struct xe_device *xe = gt_to_xe(q->gt);
struct xe_devcoredump *coredump = &xe->devcoredump;
+ va_list varg;
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
@@ -308,6 +318,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
}
coredump->captured = true;
+
+ va_start(varg, fmt);
+ coredump->snapshot.reason = kvasprintf(GFP_ATOMIC, fmt, varg);
+ va_end(varg);
+
devcoredump_snapshot(coredump, q, job);
drm_info(&xe->drm, "Xe device coredump has been created\n");
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
index c04a534e3384..6a17e6d60102 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -14,11 +14,12 @@ struct xe_exec_queue;
struct xe_sched_job;
#ifdef CONFIG_DEV_COREDUMP
-void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job);
+void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...);
int xe_devcoredump_init(struct xe_device *xe);
#else
static inline void xe_devcoredump(struct xe_exec_queue *q,
- struct xe_sched_job *job)
+ struct xe_sched_job *job,
+ const char *fmt, ...)
{
}
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index be4d59ea9ac8..e6234e887102 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -28,6 +28,10 @@ struct xe_devcoredump_snapshot {
ktime_t boot_time;
/** @process_name: Name of process that triggered this gpu hang */
char process_name[TASK_COMM_LEN];
+ /** @pid: Process id of process that triggered this gpu hang */
+ pid_t pid;
+ /** @reason: The reason the coredump was triggered */
+ char *reason;
/** @gt: Affected GT, used by forcewake for delayed capture */
struct xe_gt *gt;
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e132294fde51..7d0e7cb977ad 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -898,7 +898,8 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
if (!ret) {
xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
q->guc->id);
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
+ q->guc->id);
xe_sched_submission_start(sched);
xe_gt_reset_async(q->gt);
return;
@@ -906,7 +907,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
}
if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
- xe_devcoredump(q, NULL);
+ xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
xe_sched_submission_start(sched);
}
@@ -1132,7 +1133,9 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
xe_gt_warn(guc_to_gt(guc),
"Schedule disable failed to respond, guc_id=%d",
q->guc->id);
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
+ q->guc->id, ret, xe_guc_read_stopped(guc));
set_exec_queue_extra_ref(q);
xe_exec_queue_get(q); /* GT reset owns this */
set_exec_queue_banned(q);
@@ -1162,7 +1165,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
trace_xe_sched_job_timedout(job);
if (!exec_queue_killed(q))
- xe_devcoredump(q, job);
+ xe_devcoredump(q, job,
+ "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
/*
* Kernel jobs should never fail, nor should VM jobs if they do
--
2.47.0
* [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 1/2] drm/xe: Add a reason string to the devcoredump John.C.Harrison
@ 2024-11-22 2:53 ` John.C.Harrison
2024-11-22 4:24 ` Matthew Brost
2024-11-22 4:13 ` ✓ CI.Patch_applied: success for drm/xe: Add devcoredump locking and reason string (rev3) Patchwork
` (2 subsequent siblings)
4 siblings, 1 reply; 7+ messages in thread
From: John.C.Harrison @ 2024-11-22 2:53 UTC (permalink / raw)
To: Intel-Xe; +Cc: John Harrison
From: John Harrison <John.C.Harrison@Intel.com>
There are now multiple places that can trigger a coredump, some of
which can happen in parallel. There is already a check against
capturing multiple dumps sequentially, but without locking it is not
guaranteed to work against concurrent dumps. And if two dumps do
happen in parallel, they can end up doing Bad Things, such as one
call stack freeing the data the other call stack is still processing,
which leads to a crashed kernel.
Further, it is possible for the DRM timeout to expire and trigger a
free of the capture while a user is still reading that capture out
through sysfs, again leading to dodgy pointer problems.
So, add a mutex lock around the capture, read and free functions to
prevent interference.
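For reference, the resulting capture path, condensed from the hunks
below (the worker flush/cancel in the read and free paths deliberately
stays outside the lock so the capture worker is not serialised against
the lock itself):

        mutex_lock(&coredump->lock);
        if (coredump->captured) {
                drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
                mutex_unlock(&coredump->lock);
                return;
        }
        coredump->captured = true;
        devcoredump_snapshot(coredump, q, job);
        dev_coredumpm_timeout(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
                              xe_devcoredump_read, xe_devcoredump_free,
                              XE_COREDUMP_TIMEOUT_JIFFIES);
        mutex_unlock(&coredump->lock);

xe_devcoredump_read() and xe_devcoredump_free() then take the same
mutex around their use of the snapshot and read buffer.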
v2: Swap tiny scope spin_lock for larger scope mutex and fix
kernel-doc comment (review feedback from Matthew Brost)
v3: Move mutex locks to exclude worker thread and add reclaim
annotation (review feedback from Matthew Brost)
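The reclaim annotation added in xe_devcoredump_init() (see the hunk
below) primes lockdep at init time: taking the new mutex inside an
fs_reclaim_acquire()/fs_reclaim_release() pair records a
reclaim -> lock dependency, so any later GFP_KERNEL allocation made
while holding the lock is flagged immediately rather than only when a
real reclaim-path deadlock occurs. A sketch of the pattern (the local
variable here is illustrative):

        struct xe_devcoredump *coredump = &xe->devcoredump;

        if (IS_ENABLED(CONFIG_LOCKDEP)) {
                /* Tell lockdep this mutex may be taken under memory reclaim */
                fs_reclaim_acquire(GFP_KERNEL);
                might_lock(&coredump->lock);
                fs_reclaim_release(GFP_KERNEL);
        }

That would also be consistent with patch 1 switching the reason-string
allocation to GFP_ATOMIC rather than GFP_KERNEL.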
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 32 +++++++++++++++++++++--
drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++-
2 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index f4c77f525819..376583a4a42e 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -207,16 +207,24 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
/* Ensure delayed work is captured before continuing */
flush_work(&ss->work);
- if (!ss->read.buffer)
+ mutex_lock(&coredump->lock);
+
+ if (!ss->read.buffer) {
+ mutex_unlock(&coredump->lock);
return -ENODEV;
+ }
- if (offset >= ss->read.size)
+ if (offset >= ss->read.size) {
+ mutex_unlock(&coredump->lock);
return 0;
+ }
byte_copied = count < ss->read.size - offset ? count :
ss->read.size - offset;
memcpy(buffer, ss->read.buffer + offset, byte_copied);
+ mutex_unlock(&coredump->lock);
+
return byte_copied;
}
@@ -230,6 +238,8 @@ static void xe_devcoredump_free(void *data)
cancel_work_sync(&coredump->snapshot.work);
+ mutex_lock(&coredump->lock);
+
xe_devcoredump_snapshot_free(&coredump->snapshot);
kvfree(coredump->snapshot.read.buffer);
@@ -238,6 +248,8 @@ static void xe_devcoredump_free(void *data)
coredump->captured = false;
drm_info(&coredump_to_xe(coredump)->drm,
"Xe device coredump has been deleted.\n");
+
+ mutex_unlock(&coredump->lock);
}
static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -312,8 +324,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
struct xe_devcoredump *coredump = &xe->devcoredump;
va_list varg;
+ mutex_lock(&coredump->lock);
+
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
+ mutex_unlock(&coredump->lock);
return;
}
@@ -332,6 +347,7 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
dev_coredumpm_timeout(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
xe_devcoredump_read, xe_devcoredump_free,
XE_COREDUMP_TIMEOUT_JIFFIES);
+ mutex_unlock(&coredump->lock);
}
static void xe_driver_devcoredump_fini(void *arg)
@@ -343,6 +359,18 @@ static void xe_driver_devcoredump_fini(void *arg)
int xe_devcoredump_init(struct xe_device *xe)
{
+ int err;
+
+ err = drmm_mutex_init(&xe->drm, &xe->devcoredump.lock);
+ if (err)
+ return err;
+
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ might_lock(&xe->devcoredump->lock);
+ fs_reclaim_release(GFP_KERNEL);
+ }
+
return devm_add_action_or_reset(xe->drm.dev, xe_driver_devcoredump_fini, &xe->drm);
}
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index e6234e887102..1a1d16a96b2d 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -80,7 +80,9 @@ struct xe_devcoredump_snapshot {
* for reading the information.
*/
struct xe_devcoredump {
- /** @captured: The snapshot of the first hang has already been taken. */
+ /** @lock: protects access to entire structure */
+ struct mutex lock;
+ /** @captured: The snapshot of the first hang has already been taken */
bool captured;
/** @snapshot: Snapshot is captured at time of the first crash */
struct xe_devcoredump_snapshot snapshot;
--
2.47.0
* ✓ CI.Patch_applied: success for drm/xe: Add devcoredump locking and reason string (rev3)
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 1/2] drm/xe: Add a reason string to the devcoredump John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump John.C.Harrison
@ 2024-11-22 4:13 ` Patchwork
2024-11-22 4:13 ` ✓ CI.checkpatch: " Patchwork
2024-11-22 4:14 ` ✗ CI.KUnit: failure " Patchwork
4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-11-22 4:13 UTC (permalink / raw)
To: john.c.harrison; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Add devcoredump locking and reason string (rev3)
URL : https://patchwork.freedesktop.org/series/141628/
State : success
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: bb17c42521f5 drm-tip: 2024y-11m-21d-21h-09m-15s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Add a reason string to the devcoredump
Applying: drm/xe: Add mutex locking to devcoredump
* ✓ CI.checkpatch: success for drm/xe: Add devcoredump locking and reason string (rev3)
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
` (2 preceding siblings ...)
2024-11-22 4:13 ` ✓ CI.Patch_applied: success for drm/xe: Add devcoredump locking and reason string (rev3) Patchwork
@ 2024-11-22 4:13 ` Patchwork
2024-11-22 4:14 ` ✗ CI.KUnit: failure " Patchwork
4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-11-22 4:13 UTC (permalink / raw)
To: john.c.harrison; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Add devcoredump locking and reason string (rev3)
URL : https://patchwork.freedesktop.org/series/141628/
State : success
== Summary ==
+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
30ab6715fc09baee6cc14cb3c89ad8858688d474
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit bf02b61757b8756796fbb28c9a184677be89b7e7
Author: John Harrison <John.C.Harrison@Intel.com>
Date: Thu Nov 21 18:53:57 2024 -0800
drm/xe: Add mutex locking to devcoredump
There are now multiple places that can trigger a coredump, some of
which can happen in parallel. There is already a check against
capturing multiple dumps sequentially, but without locking it is not
guaranteed to work against concurrent dumps. And if two dumps do
happen in parallel, they can end up doing Bad Things, such as one
call stack freeing the data the other call stack is still processing,
which leads to a crashed kernel.
Further, it is possible for the DRM timeout to expire and trigger a
free of the capture while a user is still reading that capture out
through sysfs, again leading to dodgy pointer problems.
So, add a mutex lock around the capture, read and free functions to
prevent interference.
v2: Swap tiny scope spin_lock for larger scope mutex and fix
kernel-doc comment (review feedback from Matthew Brost)
v3: Move mutex locks to exclude worker thread and add reclaim
annotation (review feedback from Matthew Brost)
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
+ /mt/dim checkpatch bb17c42521f57b592e9ad49daca1f9f9045d3199 drm-intel
b67615bc7436 drm/xe: Add a reason string to the devcoredump
bf02b61757b8 drm/xe: Add mutex locking to devcoredump
* ✗ CI.KUnit: failure for drm/xe: Add devcoredump locking and reason string (rev3)
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
` (3 preceding siblings ...)
2024-11-22 4:13 ` ✓ CI.checkpatch: " Patchwork
@ 2024-11-22 4:14 ` Patchwork
4 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-11-22 4:14 UTC (permalink / raw)
To: john.c.harrison; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Add devcoredump locking and reason string (rev3)
URL : https://patchwork.freedesktop.org/series/141628/
State : failure
== Summary ==
+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:In file included from ../include/linux/bitops.h:7,
from ../include/linux/thread_info.h:27,
from ../include/asm-generic/current.h:6,
from ./arch/um/include/generated/asm/current.h:1,
from ../include/linux/mutex.h:14,
from ../drivers/gpu/drm/xe/xe_devcoredump_types.h:10,
from ../drivers/gpu/drm/xe/xe_devcoredump.c:7:
../drivers/gpu/drm/xe/xe_devcoredump.c: In function ‘xe_devcoredump_init’:
../drivers/gpu/drm/xe/xe_devcoredump.c:370:44: error: invalid type argument of ‘->’ (have ‘struct xe_devcoredump’)
370 | might_lock(&xe->devcoredump->lock);
| ^~
../include/linux/typecheck.h:11:16: note: in definition of macro ‘typecheck’
11 | typeof(x) __dummy2; \
| ^
../drivers/gpu/drm/xe/xe_devcoredump.c:370:17: note: in expansion of macro ‘might_lock’
370 | might_lock(&xe->devcoredump->lock);
| ^~~~~~~~~~
../include/linux/typecheck.h:12:25: warning: comparison of distinct pointer types lacks a cast
12 | (void)(&__dummy == &__dummy2); \
| ^~
../include/linux/lockdep.h:551:9: note: in expansion of macro ‘typecheck’
551 | typecheck(struct lockdep_map *, &(lock)->dep_map); \
| ^~~~~~~~~
../drivers/gpu/drm/xe/xe_devcoredump.c:370:17: note: in expansion of macro ‘might_lock’
370 | might_lock(&xe->devcoredump->lock);
| ^~~~~~~~~~
In file included from ../include/linux/mutex.h:17:
../drivers/gpu/drm/xe/xe_devcoredump.c:370:44: error: invalid type argument of ‘->’ (have ‘struct xe_devcoredump’)
370 | might_lock(&xe->devcoredump->lock);
| ^~
../include/linux/lockdep.h:552:24: note: in definition of macro ‘might_lock’
552 | lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \
| ^~~~
../drivers/gpu/drm/xe/xe_devcoredump.c:370:44: error: invalid type argument of ‘->’ (have ‘struct xe_devcoredump’)
370 | might_lock(&xe->devcoredump->lock);
| ^~
../include/linux/lockdep.h:553:24: note: in definition of macro ‘might_lock’
553 | lock_release(&(lock)->dep_map, _THIS_IP_); \
| ^~~~
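All of the errors above point at the same expression: devcoredump is
embedded in struct xe_device rather than being a pointer, so the `->`
dereference does not compile. Presumably the intended form matches the
drmm_mutex_init() call a few lines earlier in the same hunk:

        might_lock(&xe->devcoredump.lock);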
make[7]: *** [../scripts/Makefile.build:229: drivers/gpu/drm/xe/xe_devcoredump.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:478: drivers/gpu/drm/xe] Error 2
make[6]: *** Waiting for unfinished jobs....
make[5]: *** [../scripts/Makefile.build:478: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:478: drivers/gpu] Error 2
make[4]: *** Waiting for unfinished jobs....
make[3]: *** [../scripts/Makefile.build:478: drivers] Error 2
make[3]: *** Waiting for unfinished jobs....
../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
156 | u64 ioread64_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
163 | u64 ioread64_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
170 | u64 ioread64be_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
178 | u64 ioread64be_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
make[2]: *** [/kernel/Makefile:1936: .] Error 2
make[1]: *** [/kernel/Makefile:224: __sub-make] Error 2
make: *** [Makefile:224: __sub-make] Error 2
[04:13:46] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[04:13:51] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel
* Re: [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump
2024-11-22 2:53 ` [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump John.C.Harrison
@ 2024-11-22 4:24 ` Matthew Brost
0 siblings, 0 replies; 7+ messages in thread
From: Matthew Brost @ 2024-11-22 4:24 UTC (permalink / raw)
To: John.C.Harrison; +Cc: Intel-Xe
On Thu, Nov 21, 2024 at 06:53:57PM -0800, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> There are now multiple places that can trigger a coredump, some of
> which can happen in parallel. There is already a check against
> capturing multiple dumps sequentially, but without locking it is not
> guaranteed to work against concurrent dumps. And if two dumps do
> happen in parallel, they can end up doing Bad Things, such as one
> call stack freeing the data the other call stack is still processing,
> which leads to a crashed kernel.
>
> Further, it is possible for the DRM timeout to expire and trigger a
> free of the capture while a user is still reading that capture out
> through sysfs, again leading to dodgy pointer problems.
>
> So, add a mutex lock around the capture, read and free functions to
> prevent interference.
>
> v2: Swap tiny scope spin_lock for larger scope mutex and fix
> kernel-doc comment (review feedback from Matthew Brost)
> v3: Move mutex locks to exclude worker thread and add reclaim
> annotation (review feedback from Matthew Brost)
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 32 +++++++++++++++++++++--
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++-
> 2 files changed, 33 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index f4c77f525819..376583a4a42e 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -207,16 +207,24 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> /* Ensure delayed work is captured before continuing */
> flush_work(&ss->work);
>
> - if (!ss->read.buffer)
> + mutex_lock(&coredump->lock);
> +
> + if (!ss->read.buffer) {
> + mutex_unlock(&coredump->lock);
> return -ENODEV;
> + }
>
> - if (offset >= ss->read.size)
> + if (offset >= ss->read.size) {
> + mutex_unlock(&coredump->lock);
> return 0;
> + }
>
> byte_copied = count < ss->read.size - offset ? count :
> ss->read.size - offset;
> memcpy(buffer, ss->read.buffer + offset, byte_copied);
>
> + mutex_unlock(&coredump->lock);
> +
> return byte_copied;
> }
>
> @@ -230,6 +238,8 @@ static void xe_devcoredump_free(void *data)
>
> cancel_work_sync(&coredump->snapshot.work);
>
> + mutex_lock(&coredump->lock);
> +
> xe_devcoredump_snapshot_free(&coredump->snapshot);
> kvfree(coredump->snapshot.read.buffer);
>
> @@ -238,6 +248,8 @@ static void xe_devcoredump_free(void *data)
> coredump->captured = false;
> drm_info(&coredump_to_xe(coredump)->drm,
> "Xe device coredump has been deleted.\n");
> +
> + mutex_unlock(&coredump->lock);
> }
>
> static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> @@ -312,8 +324,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
> struct xe_devcoredump *coredump = &xe->devcoredump;
> va_list varg;
>
> + mutex_lock(&coredump->lock);
> +
> if (coredump->captured) {
> drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
> + mutex_unlock(&coredump->lock);
> return;
> }
>
> @@ -332,6 +347,7 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
> dev_coredumpm_timeout(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
> xe_devcoredump_read, xe_devcoredump_free,
> XE_COREDUMP_TIMEOUT_JIFFIES);
> + mutex_unlock(&coredump->lock);
> }
>
> static void xe_driver_devcoredump_fini(void *arg)
> @@ -343,6 +359,18 @@ static void xe_driver_devcoredump_fini(void *arg)
>
> int xe_devcoredump_init(struct xe_device *xe)
> {
> + int err;
> +
> + err = drmm_mutex_init(&xe->drm, &xe->devcoredump.lock);
> + if (err)
> + return err;
> +
> + if (IS_ENABLED(CONFIG_LOCKDEP)) {
> + fs_reclaim_acquire(GFP_KERNEL);
> + might_lock(&xe->devcoredump->lock);
> + fs_reclaim_release(GFP_KERNEL);
> + }
> +
> return devm_add_action_or_reset(xe->drm.dev, xe_driver_devcoredump_fini, &xe->drm);
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index e6234e887102..1a1d16a96b2d 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -80,7 +80,9 @@ struct xe_devcoredump_snapshot {
> * for reading the information.
> */
> struct xe_devcoredump {
> - /** @captured: The snapshot of the first hang has already been taken. */
> + /** @lock: protects access to entire structure */
> + struct mutex lock;
> + /** @captured: The snapshot of the first hang has already been taken */
> bool captured;
> /** @snapshot: Snapshot is captured at time of the first crash */
> struct xe_devcoredump_snapshot snapshot;
> --
> 2.47.0
>
End of thread (last message: 2024-11-22  4:24 UTC)
Thread overview: 7+ messages
2024-11-22 2:53 [PATCH v3 0/2] drm/xe: Add devcoredump locking and reason string John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 1/2] drm/xe: Add a reason string to the devcoredump John.C.Harrison
2024-11-22 2:53 ` [PATCH v3 2/2] drm/xe: Add mutex locking to devcoredump John.C.Harrison
2024-11-22 4:24 ` Matthew Brost
2024-11-22 4:13 ` ✓ CI.Patch_applied: success for drm/xe: Add devcoredump locking and reason string (rev3) Patchwork
2024-11-22 4:13 ` ✓ CI.checkpatch: " Patchwork
2024-11-22 4:14 ` ✗ CI.KUnit: failure " Patchwork