* [PATCH v2 1/6] drm/xe: Add LRC ctx timestamp support functions
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 22:03 ` [PATCH v2 2/6] drm/xe: Add MI_COPY_MEM_MEM GPU instruction definitions Matthew Brost
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
LRC ctx timestamp support functions are used to determine how long a job
has run on the hardware.
v2:
- Don't use static inlines (Jani)
- Kernel doc
- s/ctx_timestamp_job/ctx_job_timestamp
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
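As a usage sketch (not part of this patch; elapsed_ticks() is a
hypothetical caller, not something this series adds), the two new
helpers pair up to measure how long the current job has been on the
hardware:

  /* Hypothetical caller, for illustration only */
  static u32 elapsed_ticks(struct xe_lrc *lrc)
  {
  	u32 now = xe_lrc_ctx_timestamp(lrc);	/* live CTX_TIMESTAMP */
  	u32 start = xe_lrc_ctx_job_timestamp(lrc); /* saved at job start */

  	return now - start;	/* u32 math handles wraparound */
  }

Note the saved copy read by xe_lrc_ctx_job_timestamp() is only
meaningful once something writes it; that is wired up in patch 3.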
drivers/gpu/drm/xe/xe_lrc.c | 66 +++++++++++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_lrc.h | 5 +++
2 files changed, 71 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index c1bb85d2e243..0fef354c6489 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -652,6 +652,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
+#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K
@@ -680,6 +681,12 @@ static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}
+static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
+{
+	/* The ctx job timestamp is stored in the driver-defined portion of PPHWSP */
+ return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
+}
+
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
/* The parallel is stored in the driver-defined portion of PPHWSP */
@@ -691,6 +698,11 @@ static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
+static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
+{
+ return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
+}
+
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
/* Indirect ring state page is at the very end of LRC */
@@ -716,11 +728,65 @@ DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
+DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
+DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
#undef DECL_MAP_ADDR_HELPERS
+/**
+ * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp GGTT address
+ */
+u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+}
+
+/**
+ * xe_lrc_ctx_timestamp() - Read ctx timestamp value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp value
+ */
+u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_ctx_timestamp_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
+/**
+ * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx job timestamp GGTT address
+ */
+u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
+}
+
+/**
+ * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx job timestamp value
+ */
+u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_ctx_job_timestamp_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_pphwsp_ggtt_addr(lrc);
diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
index 882c3437ba5c..001af6c79454 100644
--- a/drivers/gpu/drm/xe/xe_lrc.h
+++ b/drivers/gpu/drm/xe/xe_lrc.h
@@ -94,6 +94,11 @@ void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot);
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p);
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot);
+u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc);
+u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc);
+u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc);
+u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc);
+
/**
* xe_lrc_update_timestamp - readout LRC timestamp and update cached value
* @lrc: logical ring context for this exec queue
--
2.34.1
* [PATCH v2 2/6] drm/xe: Add MI_COPY_MEM_MEM GPU instruction definitions
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
2024-06-07 22:03 ` [PATCH v2 1/6] drm/xe: Add LRC ctx timestamp support functions Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 22:03 ` [PATCH v2 3/6] drm/xe: Emit ctx timestamp copy in ring ops Matthew Brost
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
The MI_COPY_MEM_MEM GPU instruction is used to copy the ctx timestamp
from an LRC register to another location at the beginning of every
job's execution. Add the MI_COPY_MEM_MEM GPU instruction definitions.
v2:
- Include MI_COPY_MEM_MEM based on instruction order (Michal)
- Fix tabs/spaces issue (Michal)
- Use macro for DW definition (Michal)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
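For reference, MI_COPY_MEM_MEM is a five-dword instruction: a header
dword followed by a 64-bit destination address and a 64-bit source
address. A sketch of emitting it (variable names illustrative; patch 3
of this series uses exactly this layout):

  dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT |
  	MI_COPY_MEM_MEM_DST_GGTT;	/* both addresses are in GGTT */
  dw[i++] = dst_addr_lo;	/* destination address, bits 31:0 */
  dw[i++] = dst_addr_hi;	/* destination address, bits 63:32 */
  dw[i++] = src_addr_lo;	/* source address, bits 31:0 */
  dw[i++] = src_addr_hi;	/* source address, bits 63:32 */

The SRC/DST GGTT bits select GGTT vs PPGTT addressing for each side
independently.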
drivers/gpu/drm/xe/instructions/xe_mi_commands.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
index c74ceb550dce..b7bf99dd4848 100644
--- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
+++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
@@ -59,6 +59,10 @@
#define MI_LOAD_REGISTER_MEM (__MI_INSTR(0x29) | XE_INSTR_NUM_DW(4))
#define MI_LRM_USE_GGTT REG_BIT(22)
+#define MI_COPY_MEM_MEM (__MI_INSTR(0x2e) | XE_INSTR_NUM_DW(5))
+#define MI_COPY_MEM_MEM_SRC_GGTT REG_BIT(22)
+#define MI_COPY_MEM_MEM_DST_GGTT REG_BIT(21)
+
#define MI_BATCH_BUFFER_START __MI_INSTR(0x31)
#endif
--
2.34.1
* [PATCH v2 3/6] drm/xe: Emit ctx timestamp copy in ring ops
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
2024-06-07 22:03 ` [PATCH v2 1/6] drm/xe: Add LRC ctx timestamp support functions Matthew Brost
2024-06-07 22:03 ` [PATCH v2 2/6] drm/xe: Add MI_COPY_MEM_MEM GPU instruction definitions Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 22:03 ` [PATCH v2 4/6] drm/xe: Add ctx timestamp to LRC snapshot Matthew Brost
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
Copy the ctx timestamp to a saved location at the beginning of every
GPU job. This is used to determine how long a job has been running on
the hardware.
v2:
- s/ctx_timestamp_job/ctx_job_timestamp
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
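A note on ordering: emit_copy_timestamp() is deliberately the first
thing each emit function writes, so the copy executes before any
flushes or the batch itself and the saved value marks the start of the
job. The resulting dword stream for the simple path looks roughly like
this (illustrative):

  dw[0..4]  MI_COPY_MEM_MEM: CTX_TIMESTAMP -> ctx job timestamp slot
  dw[5]     MI_NOOP (pad)
  dw[6..]   optional TLB flush, seqno writes, batch buffer start, ...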
drivers/gpu/drm/xe/xe_ring_ops.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index db630d27beba..0be4f489d3e1 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -224,6 +224,19 @@ static u32 get_ppgtt_flag(struct xe_sched_job *job)
return job->q->vm ? BIT(8) : 0;
}
+static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
+{
+ dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT |
+ MI_COPY_MEM_MEM_DST_GGTT;
+ dw[i++] = xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
+ dw[i++] = 0;
+ dw[i++] = xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+ dw[i++] = 0;
+ dw[i++] = MI_NOOP;
+
+ return i;
+}
+
/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
u64 batch_addr, u32 seqno)
@@ -232,6 +245,8 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
u32 ppgtt_flag = get_ppgtt_flag(job);
struct xe_gt *gt = job->q->gt;
+ i = emit_copy_timestamp(lrc, dw, i);
+
if (job->ring_ops_flush_tlb) {
dw[i++] = preparser_disable(true);
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
@@ -283,6 +298,8 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
struct xe_device *xe = gt_to_xe(gt);
bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
+ i = emit_copy_timestamp(lrc, dw, i);
+
dw[i++] = preparser_disable(true);
/* hsdes: 1809175790 */
@@ -332,6 +349,8 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
u32 mask_flags = 0;
+ i = emit_copy_timestamp(lrc, dw, i);
+
dw[i++] = preparser_disable(true);
if (lacks_render)
mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
@@ -375,6 +394,8 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
{
u32 dw[MAX_JOB_SIZE_DW], i = 0;
+ i = emit_copy_timestamp(lrc, dw, i);
+
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, dw, i);
--
2.34.1
* [PATCH v2 4/6] drm/xe: Add ctx timestamp to LRC snapshot
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
` (2 preceding siblings ...)
2024-06-07 22:03 ` [PATCH v2 3/6] drm/xe: Emit ctx timestamp copy in ring ops Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 22:03 ` [PATCH v2 5/6] drm/xe: Add xe_gt_clock_interval_to_ms helper Matthew Brost
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
The ctx timestamp is useful information; add it to the LRC snapshot.
v2:
- s/ctx_timestamp_job/ctx_job_timestamp
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
drivers/gpu/drm/xe/xe_lrc.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index 0fef354c6489..f3d77aa1750d 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -49,6 +49,8 @@ struct xe_lrc_snapshot {
} tail;
u32 start_seqno;
u32 seqno;
+ u32 ctx_timestamp;
+ u32 ctx_job_timestamp;
};
static struct xe_device *
@@ -1642,6 +1644,8 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
snapshot->lrc_snapshot = NULL;
+ snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
+ snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
return snapshot;
}
@@ -1690,6 +1694,8 @@ void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer
snapshot->tail.internal, snapshot->tail.memory);
drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
+ drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
+ drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
if (!snapshot->lrc_snapshot)
return;
--
2.34.1
* [PATCH v2 5/6] drm/xe: Add xe_gt_clock_interval_to_ms helper
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
` (3 preceding siblings ...)
2024-06-07 22:03 ` [PATCH v2 4/6] drm/xe: Add ctx timestamp to LRC snapshot Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 22:03 ` [PATCH v2 6/6] drm/xe: Sample ctx timestamp to determine if jobs have timed out Matthew Brost
2024-06-07 23:27 ` ✗ CI.Patch_applied: failure for Only timeout jobs if they run longer than timeout period (rev2) Patchwork
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
Add a helper to convert GT clock ticks to msec. This is useful for
determining whether timeouts have occurred by examining GT clock ticks.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
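As a worked example (assuming a 19.2 MHz reference clock; the actual
value is read at init into gt->info.reference_clock):

  count = 38,400,000 ticks
  ms    = DIV_ROUND_UP(count * MSEC_PER_SEC, 19,200,000)
        = DIV_ROUND_UP(38,400,000,000, 19,200,000)
        = 2000 ms

The round-up matters for the TDR use in patch 6: rounding down could
under-report a runtime that is just past the timeout boundary.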
drivers/gpu/drm/xe/xe_gt_clock.c | 18 ++++++++++++++++++
drivers/gpu/drm/xe/xe_gt_clock.h | 1 +
2 files changed, 19 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c
index 9ff2061133df..a9392a743fd5 100644
--- a/drivers/gpu/drm/xe/xe_gt_clock.c
+++ b/drivers/gpu/drm/xe/xe_gt_clock.c
@@ -79,3 +79,21 @@ int xe_gt_clock_init(struct xe_gt *gt)
gt->info.reference_clock = freq;
return 0;
}
+
+static u64 div_u64_roundup(u64 nom, u32 den)
+{
+ return div_u64(nom + den - 1, den);
+}
+
+/**
+ * xe_gt_clock_interval_to_ms - Convert sampled GT clock ticks to msec
+ *
+ * @gt: the &xe_gt
+ * @count: count of GT clock ticks
+ *
+ * Returns: time in msec
+ */
+u64 xe_gt_clock_interval_to_ms(struct xe_gt *gt, u64 count)
+{
+ return div_u64_roundup(count * MSEC_PER_SEC, gt->info.reference_clock);
+}
diff --git a/drivers/gpu/drm/xe/xe_gt_clock.h b/drivers/gpu/drm/xe/xe_gt_clock.h
index 44fa0371b973..3adeb7baaca4 100644
--- a/drivers/gpu/drm/xe/xe_gt_clock.h
+++ b/drivers/gpu/drm/xe/xe_gt_clock.h
@@ -11,5 +11,6 @@
struct xe_gt;
int xe_gt_clock_init(struct xe_gt *gt);
+u64 xe_gt_clock_interval_to_ms(struct xe_gt *gt, u64 count);
#endif
--
2.34.1
* [PATCH v2 6/6] drm/xe: Sample ctx timestamp to determine if jobs have timed out
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
` (4 preceding siblings ...)
2024-06-07 22:03 ` [PATCH v2 5/6] drm/xe: Add xe_gt_clock_interval_to_ms helper Matthew Brost
@ 2024-06-07 22:03 ` Matthew Brost
2024-06-07 23:27 ` ✗ CI.Patch_applied: failure for Only timeout jobs if they run longer than timeout period (rev2) Patchwork
6 siblings, 0 replies; 8+ messages in thread
From: Matthew Brost @ 2024-06-07 22:03 UTC (permalink / raw)
To: intel-xe
In the GuC TDR, sample the ctx timestamp to determine if a job has
timed out. The scheduling enable needs to be toggled to properly sample
the timestamp. If a job has not been running for longer than the
timeout period, re-enable scheduling and restart the TDR.
v2:
- Use GT clock to msec helper (Umesh, off list)
- s/ctx_timestamp_job/ctx_job_timestamp
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
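The resulting TDR flow, sketched (simplified; plain u32 subtraction
handles the timestamp wraparound that the patch checks for explicitly):

  TDR fires
    -> disable scheduling (required for a coherent timestamp sample)
    -> diff = ctx_timestamp - ctx_job_timestamp   /* GT clock ticks */
    -> if xe_gt_clock_interval_to_ms(gt, diff), plus a 5% margin for
       GuC scheduling latency, is below the job timeout:
           re-enable scheduling and re-arm the TDR
       else:
           ban the queue, capture a devcoredump, continue the reset path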
drivers/gpu/drm/xe/xe_guc_submit.c | 153 ++++++++++++++++++++++-------
1 file changed, 120 insertions(+), 33 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 47aab04cf34f..95ac7aaec70a 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -23,6 +23,7 @@
#include "xe_force_wake.h"
#include "xe_gpu_scheduler.h"
#include "xe_gt.h"
+#include "xe_gt_clock.h"
#include "xe_gt_printk.h"
#include "xe_guc.h"
#include "xe_guc_ct.h"
@@ -61,6 +62,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
#define EXEC_QUEUE_STATE_RESET (1 << 6)
#define EXEC_QUEUE_STATE_KILLED (1 << 7)
#define EXEC_QUEUE_STATE_WEDGED (1 << 8)
+#define EXEC_QUEUE_STATE_CHECK_TIMEOUT (1 << 9)
static bool exec_queue_registered(struct xe_exec_queue *q)
{
@@ -187,6 +189,21 @@ static void set_exec_queue_wedged(struct xe_exec_queue *q)
atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
}
+static bool exec_queue_check_timeout(struct xe_exec_queue *q)
+{
+ return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_CHECK_TIMEOUT;
+}
+
+static void set_exec_queue_check_timeout(struct xe_exec_queue *q)
+{
+ atomic_or(EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
+}
+
+static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
+{
+ atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
+}
+
static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
{
return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
@@ -918,6 +935,41 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
xe_sched_submission_start(sched);
}
+#define ADJUST_FIVE_PERCENT(__t) (((__t) * 105) / 100)
+
+static bool check_timeout(struct xe_exec_queue *q)
+{
+ struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
+ u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
+ u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
+ u32 timeout_ms = q->sched_props.job_timeout_ms;
+ u32 diff;
+
+ if (ctx_timestamp < ctx_job_timestamp)
+ diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
+ else
+ diff = ctx_timestamp - ctx_job_timestamp;
+
+ /*
+	 * Ensure timeout is within 5% to account for GuC scheduling latency
+ */
+ return ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff)) >=
+ timeout_ms;
+}
+
+static void enable_scheduling(struct xe_exec_queue *q)
+{
+ struct xe_guc *guc = exec_queue_to_guc(q);
+ MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
+
+ set_exec_queue_pending_enable(q);
+ set_exec_queue_enabled(q);
+ trace_xe_exec_queue_scheduling_enable(q);
+
+ xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
+ G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
+}
+
static enum drm_gpu_sched_stat
guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
{
@@ -928,7 +980,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
int err = -ETIME;
int i = 0;
- bool wedged;
+ bool wedged, skip_timeout_check = exec_queue_reset(q) ||
+ exec_queue_killed_or_banned_or_wedged(q);
/*
* TDR has fired before free job worker. Common if exec queue
@@ -940,38 +993,22 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
return DRM_GPU_SCHED_STAT_NOMINAL;
}
- drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
- xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
- q->guc->id, q->flags);
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
- "Kernel-submitted job timed out\n");
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
- "VM job timed out on non-killed execqueue\n");
-
- if (!exec_queue_killed(q))
- xe_devcoredump(job);
-
- trace_xe_sched_job_timedout(job);
+ /* Job hasn't started, can't be timed out */
+ if (!skip_timeout_check && !xe_sched_job_started(job))
+ goto rearm;
+ /*
+	 * XXX: Sampling the timeout doesn't work in wedged mode as we have to
+	 * modify the scheduling state to read the timestamp. We could read the
+	 * timestamp from a register to accumulate the current running time, but
+	 * this doesn't work for SRIOV. For now, assume timeouts in wedged mode
+	 * are genuine timeouts.
+ */
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
/* Kill the run_job entry point */
xe_sched_submission_stop(sched);
- /*
- * Kernel jobs should never fail, nor should VM jobs if they do
- * somethings has gone wrong and the GT needs a reset
- */
- if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
- (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
- if (!xe_sched_invalidate_job(job, 2)) {
- xe_sched_add_pending_job(sched, job);
- xe_sched_submission_start(sched);
- xe_gt_reset_async(q->gt);
- goto out;
- }
- }
-
/* Engine state now stable, disable scheduling if needed */
if (!wedged && exec_queue_registered(q)) {
struct xe_guc *guc = exec_queue_to_guc(q);
@@ -979,7 +1016,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
if (exec_queue_reset(q))
err = -EIO;
- set_exec_queue_banned(q);
+ set_exec_queue_check_timeout(q);
if (!exec_queue_destroyed(q)) {
xe_exec_queue_get(q);
disable_scheduling_deregister(guc, q);
@@ -999,6 +1036,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
guc_read_stopped(guc), HZ * 5);
if (!ret || guc_read_stopped(guc)) {
drm_warn(&xe->drm, "Schedule disable failed to respond");
+ clear_exec_queue_check_timeout(q);
+ set_exec_queue_banned(q);
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
xe_gt_reset_async(q->gt);
@@ -1007,6 +1046,38 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
}
}
+ if (!skip_timeout_check && !check_timeout(q) &&
+ !exec_queue_reset(q)) {
+ clear_exec_queue_check_timeout(q);
+ goto sched_enable;
+ }
+
+ drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
+ xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+ q->guc->id, q->flags);
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
+ "Kernel-submitted job timed out\n");
+ xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
+ "VM job timed out on non-killed execqueue\n");
+
+ trace_xe_sched_job_timedout(job);
+
+ /*
+	 * Kernel jobs should never fail, nor should VM jobs; if they do,
+	 * something has gone wrong and the GT needs a reset
+ */
+ if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+ (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
+ if (!xe_sched_invalidate_job(job, 2)) {
+ xe_gt_reset_async(q->gt);
+ goto rearm;
+ }
+ }
+
+ set_exec_queue_banned(q);
+ if (!exec_queue_killed(q))
+ xe_devcoredump(job);
+
/* Stop fence signaling */
xe_hw_fence_irq_stop(q->fence_irq);
@@ -1030,6 +1101,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
out:
return DRM_GPU_SCHED_STAT_NOMINAL;
+
+sched_enable:
+ enable_scheduling(q);
+rearm:
+ /*
+	 * XXX: Ideally we would adjust the timeout based on the current
+	 * execution time, but there is currently no easy way to do this in the
+	 * DRM scheduler. With some thought, this could be done in a follow-up.
+ */
+ xe_sched_add_pending_job(sched, job);
+ xe_sched_submission_start(sched);
+
+ return DRM_GPU_SCHED_STAT_NOMINAL;
}
static void __guc_exec_queue_fini_async(struct work_struct *w)
@@ -1432,7 +1516,8 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
/* Clean up lost G2H + reset engine state */
if (exec_queue_registered(q)) {
- if ((exec_queue_banned(q) && exec_queue_destroyed(q)) ||
+ if (((exec_queue_banned(q) || exec_queue_check_timeout(q))
+ && exec_queue_destroyed(q)) ||
xe_exec_queue_is_lr(q))
xe_exec_queue_put(q);
else if (exec_queue_destroyed(q))
@@ -1604,7 +1689,8 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q)
if (q->guc->suspend_pending) {
suspend_fence_signal(q);
} else {
- if (exec_queue_banned(q)) {
+ if (exec_queue_banned(q) ||
+ exec_queue_check_timeout(q)) {
smp_wmb();
wake_up_all(&guc->ct.wq);
}
@@ -1646,7 +1732,8 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
clear_exec_queue_registered(q);
- if (exec_queue_banned(q) || xe_exec_queue_is_lr(q))
+ if (exec_queue_banned(q) || exec_queue_check_timeout(q) ||
+ xe_exec_queue_is_lr(q))
xe_exec_queue_put(q);
else
__guc_exec_queue_fini(guc, q);
@@ -1709,7 +1796,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
* guc_exec_queue_timedout_job.
*/
set_exec_queue_reset(q);
- if (!exec_queue_banned(q))
+ if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
xe_guc_exec_queue_trigger_cleanup(q);
return 0;
@@ -1739,7 +1826,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
/* Treat the same as engine reset */
set_exec_queue_reset(q);
- if (!exec_queue_banned(q))
+ if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
xe_guc_exec_queue_trigger_cleanup(q);
return 0;
--
2.34.1
* ✗ CI.Patch_applied: failure for Only timeout jobs if they run longer than timeout period (rev2)
2024-06-07 22:03 [PATCH v2 0/6] Only timeout jobs if they run longer than timeout period Matthew Brost
` (5 preceding siblings ...)
2024-06-07 22:03 ` [PATCH v2 6/6] drm/xe: Sample ctx timestamp to determine if jobs have timed out Matthew Brost
@ 2024-06-07 23:27 ` Patchwork
6 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2024-06-07 23:27 UTC (permalink / raw)
To: Matthew Brost; +Cc: intel-xe
== Series Details ==
Series: Only timeout jobs if they run longer than timeout period (rev2)
URL : https://patchwork.freedesktop.org/series/134593/
State : failure
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 2bea08bd3129 drm-tip: 2024y-06m-07d-19h-26m-01s UTC integration manifest
=== git am output follows ===
error: patch failed: drivers/gpu/drm/xe/xe_guc_submit.c:61
error: drivers/gpu/drm/xe/xe_guc_submit.c: patch does not apply
hint: Use 'git am --show-current-patch=diff' to see the failed patch
Applying: drm/xe: Add LRC ctx timestamp support functions
Applying: drm/xe: Add MI_COPY_MEM_MEM GPU instruction definitions
Applying: drm/xe: Emit ctx timestamp copy in ring ops
Applying: drm/xe: Add ctx timestamp to LRC snapshot
Applying: drm/xe: Add xe_gt_clock_interval_to_ms helper
Applying: drm/xe: Sample ctx timestamp to determine if jobs have timed out
Patch failed at 0006 drm/xe: Sample ctx timestamp to determine if jobs have timed out
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".