* [PATCH 1/2] drm/scheduler: add a current job field to scheduler
From: Nayan Deshmukh @ 2018-09-18 16:17 UTC
To: dri-devel; +Cc: Nayan Deshmukh, christian.koenig

This field points to the job running on the hardware. It is
useful when we need to access the currently executing job
from the scheduler.

Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 17 +++++++++++------
 include/drm/gpu_scheduler.h            |  2 ++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 9ca741f3a0bc..0e6ccc8243db 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -189,6 +189,7 @@ static void drm_sched_job_finish(struct work_struct *work)
 	struct drm_sched_job *s_job = container_of(work, struct drm_sched_job,
 						   finish_work);
 	struct drm_gpu_scheduler *sched = s_job->sched;
+	struct drm_sched_job *next;
 
 	/*
 	 * Canceling the timeout without removing our job from the ring mirror
@@ -201,10 +202,10 @@ static void drm_sched_job_finish(struct work_struct *work)
 
 	spin_lock(&sched->job_list_lock);
 	/* queue TDR for next job */
+	next = list_next_entry(s_job, node);
+	sched->curr_job = next;
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
-		struct drm_sched_job *next = list_next_entry(s_job, node);
-
 		if (!dma_fence_is_signaled(&next->s_fence->finished))
 			schedule_delayed_work(&next->work_tdr, sched->timeout);
 	}
@@ -233,10 +234,12 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 	spin_lock(&sched->job_list_lock);
 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
-	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    list_first_entry_or_null(&sched->ring_mirror_list,
-				     struct drm_sched_job, node) == s_job)
-		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+	if (list_first_entry_or_null(&sched->ring_mirror_list,
+				     struct drm_sched_job, node) == s_job) {
+		if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
+			schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+		sched->curr_job = s_job;
+	}
 	spin_unlock(&sched->job_list_lock);
 }
 
@@ -316,6 +319,8 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 					 struct drm_sched_job, node);
 	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
 		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+	if (s_job)
+		sched->curr_job = s_job;
 
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index daec50f887b3..07e776b1ca42 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -252,6 +252,7 @@ struct drm_sched_backend_ops {
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
  * @sched_rq: priority wise array of run queues.
+ * @curr_job: points to the job currently running on the hardware
  * @wake_up_worker: the wait queue on which the scheduler sleeps until a job
  *                  is ready to be scheduled.
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
@@ -274,6 +275,7 @@ struct drm_gpu_scheduler {
 	long				timeout;
 	const char			*name;
 	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_MAX];
+	struct drm_sched_job		*curr_job;
 	wait_queue_head_t		wake_up_worker;
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
-- 
2.14.3
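A minimal sketch of how a scheduler back-end might consume the new field. The helper name and the DRM_INFO dump are illustrative, not part of the patch; it also assumes the scheduler struct is zero-initialized by the driver, so curr_job reads as NULL before the first job runs, and it takes job_list_lock since that is the lock the patch updates curr_job under.

	/* Hypothetical sketch, not code from this series. */
	static void my_driver_dump_current_job(struct drm_gpu_scheduler *sched)
	{
		struct drm_sched_job *job;

		spin_lock(&sched->job_list_lock);
		job = sched->curr_job;	/* NULL until the first job begins */
		if (job)
			DRM_INFO("%s: job %llu is on the hardware\n",
				 sched->name, job->id);
		spin_unlock(&sched->job_list_lock);
	}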
* [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Nayan Deshmukh @ 2018-09-18 16:17 UTC
To: dri-devel; +Cc: Nayan Deshmukh, christian.koenig

Having a delayed work item per job is redundant, as we only need one
per scheduler to track the timeout of the currently executing job.

Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
Suggested-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
 include/drm/gpu_scheduler.h            |  6 +++---
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 0e6ccc8243db..f213b5c7f718 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)
 	 * manages to find this job as the next job in the list, the fence
 	 * signaled check below will prevent the timeout to be restarted.
 	 */
-	cancel_delayed_work_sync(&s_job->work_tdr);
+	cancel_delayed_work_sync(&sched->work_tdr);
 
 	spin_lock(&sched->job_list_lock);
 	/* queue TDR for next job */
@@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
 		if (!dma_fence_is_signaled(&next->s_fence->finished))
-			schedule_delayed_work(&next->work_tdr, sched->timeout);
+			schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	}
 	/* remove job from ring_mirror_list */
 	list_del(&s_job->node);
@@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 	if (list_first_entry_or_null(&sched->ring_mirror_list,
 				     struct drm_sched_job, node) == s_job) {
 		if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
-			schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+			schedule_delayed_work(&sched->work_tdr, sched->timeout);
 		sched->curr_job = s_job;
 	}
 	spin_unlock(&sched->job_list_lock);
@@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 static void drm_sched_job_timedout(struct work_struct *work)
 {
-	struct drm_sched_job *job = container_of(work, struct drm_sched_job,
-						 work_tdr.work);
+	struct drm_gpu_scheduler *sched = container_of(work,
						struct drm_gpu_scheduler,
						work_tdr.work);
+	struct drm_sched_job *job = sched->curr_job;
 
 	job->sched->ops->timedout_job(job);
 }
@@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
 					 struct drm_sched_job, node);
 	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
-		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+		schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	if (s_job)
 		sched->curr_job = s_job;
 
@@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
 
 	INIT_WORK(&job->finish_work, drm_sched_job_finish);
 	INIT_LIST_HEAD(&job->node);
-	INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
 
 	return 0;
 }
@@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 	INIT_LIST_HEAD(&sched->ring_mirror_list);
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
+	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	atomic_set(&sched->num_jobs, 0);
 	atomic64_set(&sched->job_id_count, 0);
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 07e776b1ca42..9d50d7f3eaa4 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  *               finished to remove the job from the
  *               @drm_gpu_scheduler.ring_mirror_list.
  * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
- * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
- *            interval is over.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
  *         limit of the scheduler then the job is marked guilty and will not
@@ -195,7 +193,6 @@ struct drm_sched_job {
 	struct dma_fence_cb		finish_cb;
 	struct work_struct		finish_work;
 	struct list_head		node;
-	struct delayed_work		work_tdr;
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
@@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
+ *            timeout interval is over.
  * @thread: the kthread on which the scheduler which run.
  * @ring_mirror_list: the list of jobs which are currently in the job queue.
  * @job_list_lock: lock to protect the ring_mirror_list.
@@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct delayed_work		work_tdr;
 	struct task_struct		*thread;
 	struct list_head		ring_mirror_list;
 	spinlock_t			job_list_lock;
-- 
2.14.3
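For orientation, the timer lifecycle after this patch can be summarized as follows; this is an illustrative flow description, not code from the patch:

	/*
	 * drm_sched_job_begin(first job)  -> arm sched->work_tdr
	 * drm_sched_job_finish(job)       -> cancel_delayed_work_sync(&sched->work_tdr),
	 *                                    then re-arm it if another job
	 *                                    is still on the ring
	 * sched->work_tdr fires           -> drm_sched_job_timedout() resolves
	 *                                    the victim via sched->curr_job and
	 *                                    calls sched->ops->timedout_job()
	 */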
* Re: [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Christian König @ 2018-09-19 12:30 UTC
To: Nayan Deshmukh, dri-devel

Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> Having a delayed work item per job is redundant, as we only need one
> per scheduler to track the timeout of the currently executing job.

Well that looks simpler than I thought it would be.

But it shows the next problem: the timeout and the completion could
race.

As far as I can see that can be fixed by moving the
dma_fence_remove_callback()/dma_fence_add_callback() dance from
drm_sched_hw_job_reset() to drm_sched_job_timedout().

Anyway, I would say drop patch #1, fix the one comment below, and we
can use this.

> [...]
> @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>  	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>  	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>  		if (!dma_fence_is_signaled(&next->s_fence->finished))

Since we now have only one delayed work item we can just drop the test
if next is already signaled.

Regards,
Christian.

> -			schedule_delayed_work(&next->work_tdr, sched->timeout);
> +			schedule_delayed_work(&sched->work_tdr, sched->timeout);
> [...]
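To make the race concrete: once the hardware fence signals, drm_sched_job_finish() can free the job while already-queued timeout work still dereferences sched->curr_job. A speculative sketch of the direction suggested above, letting the timeout handler race-check the fence callback before acting. The control flow here is an assumption; only the s_fence->parent/cb fields are taken from the scheduler code of this time, and this is not the committed fix.

	static void drm_sched_job_timedout(struct work_struct *work)
	{
		struct drm_gpu_scheduler *sched = container_of(work,
						struct drm_gpu_scheduler,
						work_tdr.work);
		struct drm_sched_job *job = sched->curr_job;

		/* If the callback can no longer be removed, the fence has
		 * already signaled and completion won the race; treat the
		 * timeout as a false positive.
		 */
		if (!job || (job->s_fence->parent &&
			     !dma_fence_remove_callback(job->s_fence->parent,
							&job->s_fence->cb)))
			return;

		sched->ops->timedout_job(job);
	}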
* Re: [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Michel Dänzer @ 2018-09-19 15:39 UTC
To: Christian König, Nayan Deshmukh; +Cc: dri-devel

On 2018-09-19 2:30 p.m., Christian König wrote:
> Well that looks simpler than I thought it would be.
>
> But it shows the next problem: the timeout and the completion could
> race.
>
> As far as I can see that can be fixed by moving the
> dma_fence_remove_callback()/dma_fence_add_callback() dance from
> drm_sched_hw_job_reset() to drm_sched_job_timedout().

BTW, while you guys are looking into this code, please keep an eye open
for things that could explain https://bugs.freedesktop.org/107762 .

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
* Re: [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Christian König @ 2018-09-20 6:41 UTC
To: Michel Dänzer, Christian König, Nayan Deshmukh; +Cc: dri-devel

Am 19.09.2018 um 17:39 schrieb Michel Dänzer:
> BTW, while you guys are looking into this code, please keep an eye open
> for things that could explain https://bugs.freedesktop.org/107762 .

Yeah, since we now have only one timer that should be fixed by this as
well.

Christian.
* Re: [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Nayan Deshmukh @ 2018-09-20 11:25 UTC
To: Christian König; +Cc: Maling list - DRI developers

On Wed, Sep 19, 2018, 9:31 PM Christian König <christian.koenig@amd.com> wrote:
> Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> > Having a delayed work item per job is redundant, as we only need one
> > per scheduler to track the timeout of the currently executing job.
>
> [...]
>
> > @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
> >  	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> >  	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
> >  		if (!dma_fence_is_signaled(&next->s_fence->finished))
>
> Since we now have only one delayed work item we can just drop the test
> if next is already signaled.

Can you elaborate more on this? Which test are you talking about?

Regards,
Nayan
* Re: [PATCH 2/2] drm/scheduler: remove timeout work_struct from drm_sched_job
From: Christian König @ 2018-09-20 11:30 UTC
To: Nayan Deshmukh; +Cc: Maling list - DRI developers

Am 20.09.2018 um 13:25 schrieb Nayan Deshmukh:
> On Wed, Sep 19, 2018, 9:31 PM Christian König <christian.koenig@amd.com> wrote:
>> Since we now have only one delayed work item we can just drop the test
>> if next is already signaled.
>
> Can you elaborate more on this? Which test are you talking about?

I was talking about the "!dma_fence_is_signaled()" test here:

>>>  		if (!dma_fence_is_signaled(&next->s_fence->finished))
>>> -			schedule_delayed_work(&next->work_tdr, sched->timeout);
>>> +			schedule_delayed_work(&sched->work_tdr, sched->timeout);
>>>  	}
>>>  	/* remove job from ring_mirror_list */
>>>  	list_del(&s_job->node);

Basically you could do this first and then you only need to test if
sched->ring_mirror_list is empty.

Regards,
Christian.
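Read literally, the suggestion is to delete the job from the mirror list before deciding whether to re-arm the timer, so the per-fence signaled test collapses into a list-emptiness check. A speculative sketch of drm_sched_job_finish() with that reordering, an interpretation of the comment rather than code from the thread:

	spin_lock(&sched->job_list_lock);
	/* remove the finished job from ring_mirror_list first ... */
	list_del(&s_job->node);
	/* ... then the TDR only depends on whether any job is still queued */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !list_empty(&sched->ring_mirror_list))
		schedule_delayed_work(&sched->work_tdr, sched->timeout);
	spin_unlock(&sched->job_list_lock);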
* Re: [PATCH 1/2] drm/scheduler: add a current job field to scheduler
From: Koenig, Christian @ 2018-09-18 17:09 UTC
To: Nayan Deshmukh, dri-devel@lists.freedesktop.org

Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> This field points to the job running on the hardware. It is
> useful when we need to access the currently executing job
> from the scheduler.

That should be identical to
list_first_entry_or_null(&sched->ring_mirror_list), shouldn't it?

Regards,
Christian.

> [...]
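For reference, the equivalence being pointed out: jobs are appended to the tail of ring_mirror_list in drm_sched_job_begin() and removed in drm_sched_job_finish(), so the head of the list is always the oldest unfinished job, i.e. the one on the hardware. A sketch of that lookup; the helper name is made up for illustration:

	static struct drm_sched_job *
	drm_sched_current_job(struct drm_gpu_scheduler *sched)
	{
		return list_first_entry_or_null(&sched->ring_mirror_list,
						struct drm_sched_job, node);
	}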
* Re: [PATCH 1/2] drm/scheduler: add a current job field to scheduler
From: Nayan Deshmukh @ 2018-09-18 23:13 UTC
To: Maling list - DRI developers

Hi Christian,

Yes, you are correct. My bad.

Do you have any comments on the second patch? I will drop this patch
and rebase the second one.

Regards,
Nayan

On Wed, Sep 19, 2018, 2:09 AM Koenig, Christian <Christian.Koenig@amd.com> wrote:
> That should be identical to
> list_first_entry_or_null(&sched->ring_mirror_list), shouldn't it?
>
> [...]