public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Marco Pagani <marco.pagani@linux.dev>
To: "Tvrtko Ursulin" <tursulin@ursulin.net>,
	"Matthew Brost" <matthew.brost@intel.com>,
	"Danilo Krummrich" <dakr@kernel.org>,
	"Philipp Stanner" <phasta@kernel.org>,
	"Christian König" <ckoenig.leichtzumerken@gmail.com>,
	"Maarten Lankhorst" <maarten.lankhorst@linux.intel.com>,
	"Maxime Ripard" <mripard@kernel.org>,
	"Thomas Zimmermann" <tzimmermann@suse.de>,
	"David Airlie" <airlied@gmail.com>,
	"Simona Vetter" <simona@ffwll.ch>
Cc: dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2] drm/sched: Add test suite for concurrent job submissions
Date: Fri, 10 Apr 2026 12:30:43 +0200	[thread overview]
Message-ID: <163a3faa-639e-409f-942e-68caf37ea7ff@linux.dev> (raw)
In-Reply-To: <cd8f23ce-9c79-4ba8-85dc-735fe6e89d6d@ursulin.net>



On 10/04/2026 09:41, Tvrtko Ursulin wrote:
> 
> On 09/04/2026 16:35, Tvrtko Ursulin wrote:
>>
>> On 08/04/2026 16:49, Marco Pagani wrote:
>>> Add a new test suite to simulate concurrent job submissions to the DRM
>>> scheduler, as this functionality is not covered by current test suites.
>>>
>>> The new test suite includes two initial test cases: (i) a test case for
>>> parallel job submission and (ii) a test case for interleaved job
>>> submission and completion. In the first test case, worker threads
>>> concurrently submit jobs to the scheduler, and then the timeline is
>>> manually advanced to complete them in bulk. In the second test case,
>>> worker threads concurrently submit sequences of jobs of different
>>> durations to the mock scheduler using a sliding window to better model
>>> real-world workloads. The timeline is advanced automatically by the
>>> finishing jobs, interleaving submission with completion.
>>>
>>> Signed-off-by: Marco Pagani <marco.pagani@linux.dev>
>>> ---
>>> Changes in v2:
>>> - Improved test description
>>> - Use multiple job durations instead of harmonic periods/durations
>>> - Improved submission for interleaved test with a sliding window
>>> - Removed unnecessary asserts per Tvrtko's feedback, but kept 
>>> wait_scheduled
>>> - Changed parameter names from period to duration for clarity
>>> - Used temp variables to reduce line breaks
>>> ---
>>>   drivers/gpu/drm/scheduler/tests/tests_basic.c | 356 ++++++++++++++++++
>>>   1 file changed, 356 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/tests/tests_basic.c b/drivers/ 
>>> gpu/drm/scheduler/tests/tests_basic.c
>>> index a5a5a35a87b0..1791b157cfc8 100644
>>> --- a/drivers/gpu/drm/scheduler/tests/tests_basic.c
>>> +++ b/drivers/gpu/drm/scheduler/tests/tests_basic.c
>>> @@ -1,7 +1,11 @@
>>>   // SPDX-License-Identifier: GPL-2.0
>>>   /* Copyright (c) 2025 Valve Corporation */
>>> +#include <linux/completion.h>
>>>   #include <linux/delay.h>
>>> +#include <linux/minmax.h>
>>> +#include <linux/time.h>
>>> +#include <linux/workqueue.h>
>>>   #include "sched_tests.h"
>>> @@ -235,6 +239,357 @@ static void drm_sched_basic_cancel(struct kunit 
>>> *test)
>>>       KUNIT_ASSERT_EQ(test, job->hw_fence.error, -ECANCELED);
>>>   }
>>> +struct sched_concurrent_context {
>>> +    struct drm_mock_scheduler *sched;
>>> +    struct workqueue_struct *sub_wq;
>>> +    struct kunit *test;
>>> +    struct completion wait_go;
>>> +};
>>> +
>>> +KUNIT_DEFINE_ACTION_WRAPPER(drm_mock_sched_fini_wrap, 
>>> drm_mock_sched_fini,
>>> +                struct drm_mock_scheduler *);
>>> +
>>> +KUNIT_DEFINE_ACTION_WRAPPER(drm_mock_sched_entity_free_wrap, 
>>> drm_mock_sched_entity_free,
>>> +                struct drm_mock_sched_entity *);
>>> +
>>> +static void complete_destroy_workqueue(void *context)
>>> +{
>>> +    struct sched_concurrent_context *ctx = context;
>>> +
>>> +    complete_all(&ctx->wait_go);
>>> +
>>> +    destroy_workqueue(ctx->sub_wq);
>>> +}
>>> +
>>> +static int drm_sched_concurrent_init(struct kunit *test)
>>> +{
>>> +    struct sched_concurrent_context *ctx;
>>> +    int ret;
>>> +
>>> +    ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
>>> +    KUNIT_ASSERT_NOT_NULL(test, ctx);
>>> +
>>> +    init_completion(&ctx->wait_go);
>>> +
>>> +    ctx->sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT);
>>> +
>>> +    ret = kunit_add_action_or_reset(test, drm_mock_sched_fini_wrap, 
>>> ctx->sched);
>>> +    KUNIT_ASSERT_EQ(test, ret, 0);
>>> +
>>> +    /* Use an unbounded workqueue to maximize job submission 
>>> concurrency */
>>> +    ctx->sub_wq = alloc_workqueue("drm-sched-submitters-wq", WQ_UNBOUND,
>>> +                      WQ_UNBOUND_MAX_ACTIVE);
>>> +    KUNIT_ASSERT_NOT_NULL(test, ctx->sub_wq);
>>> +
>>> +    ret = kunit_add_action_or_reset(test, complete_destroy_workqueue, 
>>> ctx);
>>> +    KUNIT_ASSERT_EQ(test, ret, 0);
>>> +
>>> +    ctx->test = test;
>>> +    test->priv = ctx;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +struct drm_sched_parallel_params {
>>> +    const char *description;
>>> +    unsigned int num_jobs;
>>> +    unsigned int num_workers;
>>> +};
>>> +
>>> +static const struct drm_sched_parallel_params 
>>> drm_sched_parallel_cases[] = {
>>> +    {
>>> +        .description = "Parallel submission of multiple jobs per 
>>> worker",
>>> +        .num_jobs = 8,
>>> +        .num_workers = 16,
>>> +    },
>>> +};
>>> +
>>> +static void
>>> +drm_sched_parallel_desc(const struct drm_sched_parallel_params 
>>> *params, char *desc)
>>> +{
>>> +    strscpy(desc, params->description, KUNIT_PARAM_DESC_SIZE);
>>> +}
>>> +
>>> +KUNIT_ARRAY_PARAM(drm_sched_parallel, drm_sched_parallel_cases, 
>>> drm_sched_parallel_desc);
>>
>> As long as you don't plan to add more test cases any time soon the 
>> boiler plate could be reduced by not using KUNIT_ARRAY_PARAM but it is
>> up to you. (Same for the other test below.)
>>
>>> +
>>> +struct parallel_worker {
>>> +    struct work_struct work;
>>> +    struct sched_concurrent_context *ctx;
>>> +    struct drm_mock_sched_entity *entity;
>>> +    struct drm_mock_sched_job **jobs;
>>> +    unsigned int id;
>>> +};
>>> +
>>> +static void drm_sched_parallel_worker(struct work_struct *work)
>>> +{
>>> +    const struct drm_sched_parallel_params *params;
>>> +    struct sched_concurrent_context *test_ctx;
>>> +    struct parallel_worker *worker;
>>> +    unsigned int i;
>>> +
>>> +    worker = container_of(work, struct parallel_worker, work);
>>> +    test_ctx = worker->ctx;
>>> +    params = test_ctx->test->param_value;
>>> +
>>> +    wait_for_completion(&test_ctx->wait_go);
>>> +
>>> +    kunit_info(test_ctx->test, "Parallel worker %u submitting %u jobs 
>>> started\n",
>>> +           worker->id, params->num_jobs);
>>> +
>>> +    for (i = 0; i < params->num_jobs; i++)
>>> +        drm_mock_sched_job_submit(worker->jobs[i]);
>>> +}
>>> +
>>> +/*
>>> + * Spawns workers that submit a sequence of jobs to the mock scheduler.
>>> + * Once all jobs are submitted, the timeline is manually advanced.
>>> + */
>>> +static void drm_sched_parallel_submit_test(struct kunit *test)
>>> +{
>>> +    struct sched_concurrent_context *ctx = test->priv;
>>> +    const struct drm_sched_parallel_params *params = test->param_value;
>>> +    struct parallel_worker *workers, *worker;
>>> +    struct drm_mock_sched_job *job;
>>> +    unsigned int i, j, completed_jobs, total_jobs;
>>> +    bool done;
>>> +    int ret;
>>> +
>>> +    KUNIT_ASSERT_GT(test, params->num_workers, 0);
>>> +    KUNIT_ASSERT_GT(test, params->num_jobs, 0);
>>> +
>>> +    workers = kunit_kcalloc(test, params->num_workers, sizeof(*workers),
>>> +                GFP_KERNEL);
>>> +    KUNIT_ASSERT_NOT_NULL(test, workers);
>>> +
>>> +    /*
>>> +     * Init workers only after all jobs and entities have been 
>>> successfully
>>> +     * allocated. In this way, the cleanup logic for when an 
>>> assertion fail
>>> +     * can be simplified.
>>> +     */
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        worker->id = i;
>>> +        worker->ctx = ctx;
>>> +        worker->entity = drm_mock_sched_entity_new(test,
>>> +                               DRM_SCHED_PRIORITY_NORMAL,
>>> +                               ctx->sched);
>>> +
>>> +        ret = kunit_add_action_or_reset(test, 
>>> drm_mock_sched_entity_free_wrap,
>>> +                        worker->entity);
>>> +        KUNIT_ASSERT_EQ(test, ret, 0);
>>> +
>>> +        worker->jobs = kunit_kcalloc(test, params->num_jobs,
>>> +                         sizeof(*worker->jobs), GFP_KERNEL);
>>> +        KUNIT_ASSERT_NOT_NULL(test, worker->jobs);
>>> +
>>> +        for (j = 0; j < params->num_jobs; j++) {
>>> +            job = drm_mock_sched_job_new(test, worker->entity);
>>> +            worker->jobs[j] = job;
>>> +        }
>>> +    }
>>> +
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        INIT_WORK(&worker->work, drm_sched_parallel_worker);
>>> +        queue_work(ctx->sub_wq, &worker->work);
>>> +    }
>>> +
>>> +    complete_all(&ctx->wait_go);
>>> +    flush_workqueue(ctx->sub_wq);
>>> +
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        for (j = 0; j < params->num_jobs; j++) {
>>> +            job = worker->jobs[j];
>>> +            done = drm_mock_sched_job_wait_scheduled(job, HZ);
>>> +            KUNIT_EXPECT_TRUE(test, done);
>>> +        }
>>> +    }
>>> +
>>> +    total_jobs = params->num_workers * params->num_jobs;
>>> +    completed_jobs = drm_mock_sched_advance(ctx->sched, total_jobs);
>>> +    KUNIT_EXPECT_EQ(test, completed_jobs, total_jobs);
>>> +
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        for (j = 0; j < params->num_jobs; j++) {
>>> +            job = worker->jobs[j];
>>> +            done = drm_mock_sched_job_wait_finished(job, HZ);
>>> +            KUNIT_EXPECT_TRUE(test, done);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +struct drm_sched_interleaved_params {
>>> +    const char *description;
>>> +    unsigned int test_duration_ms;
>>> +    unsigned int job_base_duration_us;
>>> +    unsigned int num_workers;
>>> +    unsigned int num_in_flight_jobs;
>>> +};
>>> +
>>> +static const struct drm_sched_interleaved_params 
>>> drm_sched_interleaved_cases[] = {
>>> +    {
>>> +        .description = "Interleaved submission of multiple jobs per 
>>> worker",
>>> +        .test_duration_ms = 1000,
>>> +        .job_base_duration_us = 100,
>>> +        .num_workers = 16,
>>> +        .num_in_flight_jobs = 8,
>>> +    },
>>> +};
>>> +
>>> +static void
>>> +drm_sched_interleaved_desc(const struct drm_sched_interleaved_params 
>>> *params, char *desc)
>>> +{
>>> +    strscpy(desc, params->description, KUNIT_PARAM_DESC_SIZE);
>>> +}
>>> +
>>> +KUNIT_ARRAY_PARAM(drm_sched_interleaved, drm_sched_interleaved_cases,
>>> +          drm_sched_interleaved_desc);
>>> +
>>> +struct interleaved_worker {
>>> +    struct work_struct work;
>>> +    struct sched_concurrent_context *ctx;
>>> +    struct drm_mock_sched_entity *entity;
>>> +    struct drm_mock_sched_job **jobs;
>>> +    unsigned int id;
>>> +    unsigned int job_count;
>>> +    unsigned int job_duration_us;
>>> +};
>>> +
>>> +static void drm_sched_interleaved_worker(struct work_struct *work)
>>> +{
>>> +    struct sched_concurrent_context *test_ctx;
>>> +    const struct drm_sched_interleaved_params *params;
>>> +    struct interleaved_worker *worker;
>>> +    unsigned int i, j, max_in_flight_job;
>>> +    unsigned long timeout;
>>> +    bool done;
>>> +
>>> +    worker = container_of(work, struct interleaved_worker, work);
>>> +    test_ctx = worker->ctx;
>>> +    params = test_ctx->test->param_value;
>>> +
>>> +    wait_for_completion(&test_ctx->wait_go);
>>> +
>>> +    kunit_info(test_ctx->test, "Worker %u submitting %u jobs of %u us 
>>> started\n",
>>> +           worker->id, worker->job_count, worker->job_duration_us);
>>> +
>>> +    timeout = msecs_to_jiffies(params->test_duration_ms * 2);
>>> +
>>> +    /* Fill the submission window */
>>> +    max_in_flight_job = min(worker->job_count, params- 
>>>> num_in_flight_jobs);
>>> +    for (i = 0; i < max_in_flight_job; i++)
>>> +        drm_mock_sched_job_submit(worker->jobs[i]);
>>> +
>>> +    /* Keep the window full by submitting a new job at once until 
>>> done */
>>> +    for (i = 0; i < worker->job_count; i++) {
>>> +        done = drm_mock_sched_job_wait_finished(worker->jobs[i], 
>>> timeout);
>>> +        if (!done)
>>> +            kunit_info(test_ctx->test, "Job %u of worker %u timed 
>>> out\n",
>>> +                   i, worker->id);
>>> +
>>> +        j = i + max_in_flight_job;
>>> +        if (j < worker->job_count)
>>> +            drm_mock_sched_job_submit(worker->jobs[j]);
>>> +    }
>>
>> The loop maybe isn't the most straight-forward but eventually looked to 
>> me like it will do what it wants.
> 
> One idea to replace two loops with a single hopefully clearer one:
> 
> for (i = 0; i < worker->job_count; i++) {
> 	if (i >= max_in_flight_job)
> 		drm_mock_sched_job_wait_finished(worker->jobs[i - max_in_flight_job], 
> timeout);
> 	drm_mock_sched_job_submit(worker->jobs[i]);
> }
> 

It seems to me that this particular loop will not work as expected: since
it only waits on job (i - max_in_flight_job) before submitting job i, it
never waits for the last window of submitted jobs to finish.

Considering the feedback I received for v1, my intention was to avoid
negative-offset indexing (i - something) in loops, to keep the code simple
and safe against accidental index wraps if the code is extended in the
future.

However, it seems that I achieved the opposite effect. So, I'm open to
changing it in whatever way you and Philipp think is best.

Thanks,
Marco

>> num_in_flight_jobs as a test parameter is perhaps arbitrary? I am not 
>> sure what it aims to achieve versus if workers would just be submitting 
>> one by one, or perhaps two by two, in both cases loops would be simpler. 
>> Or even if num_in_flight_jobs would perhaps be automatically derived 
>> from the worker->id? If the goal is to simulate queue depth it sounds 
>> good to vary it.
>>
>> But these are minor points and up to you. The test now looks passably 
>> tidy as it is.
>>
>> Ah wait, please also check if this one needs to be marked as "slow" 
>> kunit test. AFAIR all that are expected to take more than 1s need that.
>>
>> Regards,
>>
>> Tvrtko
>>
>>> +}
>>> +
>>> +/*
>>> + * Spawns workers that submit a sequence of jobs to the mock 
>>> scheduler. Job
>>> + * durations are chosen as multiples of a base duration value 
>>> specified as
>>> + * a test parameter. Since the scheduler serializes jobs from all 
>>> workers,
>>> + * the total test duration budget is divided into equal shares among 
>>> workers.
>>> + * These shares are then used to compute the number of jobs that each 
>>> worker
>>> + * can submit
>>> + */
>>> +static void drm_sched_interleaved_submit_test(struct kunit *test)
>>> +{
>>> +    const struct drm_sched_interleaved_params *params = test- 
>>>> param_value;
>>> +    struct sched_concurrent_context *ctx = test->priv;
>>> +    struct interleaved_worker *workers, *worker;
>>> +    struct drm_mock_sched_job *job;
>>> +    unsigned int worker_share_us;
>>> +    unsigned int i, j;
>>> +    bool done;
>>> +    int ret;
>>> +
>>> +    KUNIT_ASSERT_GT(test, params->num_workers, 0);
>>> +    KUNIT_ASSERT_GT(test, params->job_base_duration_us, 0);
>>> +
>>> +    workers = kunit_kcalloc(test, params->num_workers, sizeof(*workers),
>>> +                GFP_KERNEL);
>>> +    KUNIT_ASSERT_NOT_NULL(test, workers);
>>> +
>>> +    /* Divide the available test time into equal shares among the 
>>> workers */
>>> +    worker_share_us = (params->test_duration_ms * USEC_PER_MSEC) /
>>> +              params->num_workers;
>>> +
>>> +    /*
>>> +     * Init workers only after all jobs and entities have been 
>>> successfully
>>> +     * allocated. In this way, the cleanup logic for when an 
>>> assertion fails
>>> +     * can be simplified.
>>> +     */
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        worker->id = i;
>>> +        worker->ctx = ctx;
>>> +
>>> +        worker->job_duration_us = params->job_base_duration_us * (i + 
>>> 1);
>>> +        worker->job_count = worker_share_us / worker->job_duration_us;
>>> +        worker->job_count = max(1U, worker->job_count);
>>> +
>>> +        worker->entity = drm_mock_sched_entity_new(test,
>>> +                               DRM_SCHED_PRIORITY_NORMAL,
>>> +                               ctx->sched);
>>> +
>>> +        ret = kunit_add_action_or_reset(test, 
>>> drm_mock_sched_entity_free_wrap,
>>> +                        worker->entity);
>>> +        KUNIT_ASSERT_EQ(test, ret, 0);
>>> +
>>> +        worker->jobs = kunit_kcalloc(test, worker->job_count,
>>> +                         sizeof(*worker->jobs), GFP_KERNEL);
>>> +        KUNIT_ASSERT_NOT_NULL(test, worker->jobs);
>>> +
>>> +        for (j = 0; j < worker->job_count; j++) {
>>> +            job = drm_mock_sched_job_new(test, worker->entity);
>>> +            drm_mock_sched_job_set_duration_us(job, worker- 
>>>> job_duration_us);
>>> +
>>> +            worker->jobs[j] = job;
>>> +        }
>>> +    }
>>> +
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        INIT_WORK(&worker->work, drm_sched_interleaved_worker);
>>> +        queue_work(ctx->sub_wq, &worker->work);
>>> +    }
>>> +
>>> +    complete_all(&ctx->wait_go);
>>> +    flush_workqueue(ctx->sub_wq);
>>> +
>>> +    for (i = 0; i < params->num_workers; i++) {
>>> +        worker = &workers[i];
>>> +        for (j = 0; j < worker->job_count; j++) {
>>> +            job = worker->jobs[j];
>>> +            done = drm_mock_sched_job_is_finished(job);
>>> +            KUNIT_EXPECT_TRUE(test, done);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static struct kunit_case drm_sched_concurrent_tests[] = {
>>> +    KUNIT_CASE_PARAM(drm_sched_parallel_submit_test, 
>>> drm_sched_parallel_gen_params),
>>> +    KUNIT_CASE_PARAM(drm_sched_interleaved_submit_test, 
>>> drm_sched_interleaved_gen_params),
>>> +    {}
>>> +};
>>> +
>>> +static struct kunit_suite drm_sched_concurrent = {
>>> +    .name = "drm_sched_concurrent_tests",
>>> +    .init = drm_sched_concurrent_init,
>>> +    .test_cases = drm_sched_concurrent_tests,
>>> +};
>>> +
>>>   static struct kunit_case drm_sched_cancel_tests[] = {
>>>       KUNIT_CASE(drm_sched_basic_cancel),
>>>       {}
>>> @@ -556,6 +911,7 @@ static struct kunit_suite drm_sched_credits = {
>>>   };
>>>   kunit_test_suites(&drm_sched_basic,
>>> +          &drm_sched_concurrent,
>>>             &drm_sched_timeout,
>>>             &drm_sched_cancel,
>>>             &drm_sched_priority,
>>
> 


  reply	other threads:[~2026-04-10 10:30 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-08 15:49 [PATCH v2] drm/sched: Add test suite for concurrent job submissions Marco Pagani
2026-04-09 15:35 ` Tvrtko Ursulin
2026-04-10  7:41   ` Tvrtko Ursulin
2026-04-10 10:30     ` Marco Pagani [this message]
2026-04-10  9:07   ` Marco Pagani
2026-04-10  9:19     ` Philipp Stanner
2026-04-10 11:33       ` Marco Pagani

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=163a3faa-639e-409f-942e-68caf37ea7ff@linux.dev \
    --to=marco.pagani@linux.dev \
    --cc=airlied@gmail.com \
    --cc=ckoenig.leichtzumerken@gmail.com \
    --cc=dakr@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=maarten.lankhorst@linux.intel.com \
    --cc=matthew.brost@intel.com \
    --cc=mripard@kernel.org \
    --cc=phasta@kernel.org \
    --cc=simona@ffwll.ch \
    --cc=tursulin@ursulin.net \
    --cc=tzimmermann@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox