* [linux-next:master 12502/12880] drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: sparse: incorrect type in argument 1 (different address spaces)
From: kernel test robot <lkp@intel.com>
Date: 2024-07-13 2:53 UTC
To: José Roberto de Souza
Cc: oe-kbuild-all, Linux Memory Management List, Matthew Brost
tree: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
head: 3fe121b622825ff8cc995a1e6b026181c48188db
commit: f6ca930d974e473fd608fc9aa1759fbe731fe44d [12502/12880] drm/xe: Add process name and PID to job timedout message
config: x86_64-randconfig-123-20240713 (https://download.01.org/0day-ci/archive/20240713/202407131049.PQilkG1A-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240713/202407131049.PQilkG1A-lkp@intel.com/reproduce)
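A rough local approximation of what the robot runs is kbuild's sparse hook on a W=1 build of just this driver (a generic sketch only; the robot's exact compiler, config, and wrapper script are in the links above):

    # C=1 runs sparse on every file that gets (re)built; sparse is the
    # default checker, so CHECK=sparse is spelled out only for clarity.
    make W=1 C=1 CHECK=sparse drivers/gpu/drm/xe/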
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags:
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407131049.PQilkG1A-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
>> drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct pid *pid @@ got struct pid [noderef] __rcu *pid @@
drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: expected struct pid *pid
drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: got struct pid [noderef] __rcu *pid
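The mismatch is visible in the message itself: drm_file::pid is annotated 'struct pid [noderef] __rcu *', while get_pid_task() takes a plain 'struct pid *', so the pointer must be dereferenced under RCU before the call. A minimal sketch of that pattern for the call flagged below (illustrative only, not necessarily the fix that eventually landed):

    rcu_read_lock();
    /*
     * rcu_dereference() strips the __rcu address space for sparse and is
     * only legal inside an RCU read-side critical section. get_pid_task()
     * takes its own reference on the task, so the RCU lock can be dropped
     * as soon as the lookup returns.
     */
    task = get_pid_task(rcu_dereference(q->vm->xef->drm->pid), PIDTYPE_PID);
    rcu_read_unlock();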
vim +1164 drivers/gpu/drm/xe/xe_guc_submit.c
1054
1055 static enum drm_gpu_sched_stat
1056 guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
1057 {
1058 struct xe_sched_job *job = to_xe_sched_job(drm_job);
1059 struct xe_sched_job *tmp_job;
1060 struct xe_exec_queue *q = job->q;
1061 struct xe_gpu_scheduler *sched = &q->guc->sched;
1062 struct xe_guc *guc = exec_queue_to_guc(q);
1063 const char *process_name = "no process";
1064 struct task_struct *task = NULL;
1065 int err = -ETIME;
1066 pid_t pid = -1;
1067 int i = 0;
1068 bool wedged, skip_timeout_check;
1069
1070 /*
1071 * TDR has fired before the free job worker ran. Common if the exec
1072 * queue is closed immediately after the last fence signaled.
1073 */
1074 if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
1075 guc_exec_queue_free_job(drm_job);
1076
1077 return DRM_GPU_SCHED_STAT_NOMINAL;
1078 }
1079
1080 /* Kill the run_job entry point */
1081 xe_sched_submission_stop(sched);
1082
1083 /* Must check all state after stopping scheduler */
1084 skip_timeout_check = exec_queue_reset(q) ||
1085 exec_queue_killed_or_banned_or_wedged(q) ||
1086 exec_queue_destroyed(q);
1087
1088 /* Job hasn't started, can't be timed out */
1089 if (!skip_timeout_check && !xe_sched_job_started(job))
1090 goto rearm;
1091
1092 /*
1093 * XXX: Sampling the timeout doesn't work in wedged mode as we have to
1094 * modify scheduling state to read the timestamp. We could read the
1095 * timestamp from a register to accumulate current running time, but this
1096 * doesn't work for SRIOV. For now, assume timeouts in wedged mode are
1097 * genuine timeouts.
1098 */
1099 wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
1100
1101 /* Engine state now stable, disable scheduling to check timestamp */
1102 if (!wedged && exec_queue_registered(q)) {
1103 int ret;
1104
1105 if (exec_queue_reset(q))
1106 err = -EIO;
1107
1108 if (!exec_queue_destroyed(q)) {
1109 /*
1110 * Wait for any pending G2H to flush out before
1111 * modifying state
1112 */
1113 ret = wait_event_timeout(guc->ct.wq,
1114 !exec_queue_pending_enable(q) ||
1115 guc_read_stopped(guc), HZ * 5);
1116 if (!ret || guc_read_stopped(guc))
1117 goto trigger_reset;
1118
1119 /*
1120 * The flag communicates to the G2H handler that the schedule
1121 * disable originated from a timeout check. The G2H handler then
1122 * avoids triggering cleanup or deregistering the exec
1123 * queue.
1124 */
1125 set_exec_queue_check_timeout(q);
1126 disable_scheduling(q, skip_timeout_check);
1127 }
1128
1129 /*
1130 * Must wait for scheduling to be disabled before signalling
1131 * any fences; if the GT is broken, the GT reset code should signal us.
1132 *
1133 * FIXME: Tests can generate a ton of 0x6000 (IOMMU CAT fault
1134 * error) messages which can cause the schedule disable to get
1135 * lost. If this occurs, trigger a GT reset to recover.
1136 */
1137 smp_rmb();
1138 ret = wait_event_timeout(guc->ct.wq,
1139 !exec_queue_pending_disable(q) ||
1140 guc_read_stopped(guc), HZ * 5);
1141 if (!ret || guc_read_stopped(guc)) {
1142 trigger_reset:
1143 if (!ret)
1144 xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
1145 set_exec_queue_extra_ref(q);
1146 xe_exec_queue_get(q); /* GT reset owns this */
1147 set_exec_queue_banned(q);
1148 xe_gt_reset_async(q->gt);
1149 xe_sched_tdr_queue_imm(sched);
1150 goto rearm;
1151 }
1152 }
1153
1154 /*
1155 * Check if the job is actually timed out; if not, restart job execution and the TDR
1156 */
1157 if (!wedged && !skip_timeout_check && !check_timeout(q, job) &&
1158 !exec_queue_reset(q) && exec_queue_registered(q)) {
1159 clear_exec_queue_check_timeout(q);
1160 goto sched_enable;
1161 }
1162
1163 if (q->vm && q->vm->xef) {
> 1164 task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
1165 if (task) {
1166 process_name = task->comm;
1167 pid = task->pid;
1168 }
1169 }
1170 xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
1171 xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
1172 q->guc->id, q->flags, process_name, pid);
1173 if (task)
1174 put_task_struct(task);
1175
1176 trace_xe_sched_job_timedout(job);
1177
1178 if (!exec_queue_killed(q))
1179 xe_devcoredump(job);
1180
1181 /*
1182 * Kernel jobs should never fail, nor should VM jobs; if they do,
1183 * something has gone wrong and the GT needs a reset
1184 */
1185 xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
1186 "Kernel-submitted job timed out\n");
1187 xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
1188 "VM job timed out on non-killed execqueue\n");
1189 if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
1190 (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
1191 if (!xe_sched_invalidate_job(job, 2)) {
1192 clear_exec_queue_check_timeout(q);
1193 xe_gt_reset_async(q->gt);
1194 goto rearm;
1195 }
1196 }
1197
1198 /* Finish cleaning up exec queue via deregister */
1199 set_exec_queue_banned(q);
1200 if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
1201 set_exec_queue_extra_ref(q);
1202 xe_exec_queue_get(q);
1203 __deregister_exec_queue(guc, q);
1204 }
1205
1206 /* Stop fence signaling */
1207 xe_hw_fence_irq_stop(q->fence_irq);
1208
1209 /*
1210 * Fence state now stable; stop / start the scheduler, which cleans up
1211 * any fences that are complete
1212 */
1213 xe_sched_add_pending_job(sched, job);
1214 xe_sched_submission_start(sched);
1215
1216 xe_guc_exec_queue_trigger_cleanup(q);
1217
1218 /* Mark all outstanding jobs as bad, thus completing them */
1219 spin_lock(&sched->base.job_list_lock);
1220 list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
1221 xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
1222 spin_unlock(&sched->base.job_list_lock);
1223
1224 /* Start fence signaling */
1225 xe_hw_fence_irq_start(q->fence_irq);
1226
1227 return DRM_GPU_SCHED_STAT_NOMINAL;
1228
1229 sched_enable:
1230 enable_scheduling(q);
1231 rearm:
1232 /*
1233 * XXX: Ideally we would adjust the timeout based on current execution
1234 * time, but there is not currently an easy way to do this in the DRM
1235 * scheduler. With some thought, do this in a follow-up.
1236 */
1237 xe_sched_add_pending_job(sched, job);
1238 xe_sched_submission_start(sched);
1239
1240 return DRM_GPU_SCHED_STAT_NOMINAL;
1241 }
1242
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki