Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: John Harrison <john.c.harrison@intel.com>
To: "Ceraolo Spurio, Daniele" <daniele.ceraolospurio@intel.com>,
	<Intel-GFX@Lists.FreeDesktop.Org>
Cc: DRI-Devel@Lists.FreeDesktop.Org
Subject: Re: [Intel-gfx] [PATCH 2/2] drm/i915/guc: Fix missing return code checks in submission init
Date: Fri, 17 Feb 2023 12:40:52 -0800	[thread overview]
Message-ID: <bc45f128-bb22-7b35-d931-22e13a7902c0@intel.com> (raw)
In-Reply-To: <00d0e2c1-3187-7025-c8eb-821bee0d0f45@intel.com>

On 1/24/2023 17:01, Ceraolo Spurio, Daniele wrote:
> On 1/11/2023 5:54 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The CI results for the 'fast request' patch set (enables error return
>> codes for fire-and-forget H2G messages) hit an issue with the KMD
>> sending context submission requests on an invalid context. That was
>> caused by a fault injection probe failing the context creation of a
>> kernel context. However, there was no return code checking on any of
>> the kernel context registration paths. So the driver kept going and
>> tried to use the kernel context for the record defaults process.
>>
>> This would not cause any actual problems. The invalid requests would
>> be rejected by GuC and ultimately the start up sequence would
>> correctly wedge due to the context creation failure. But fixing the
>> issue correctly rather ignoring it means we won't get CI complaining
>> when the fast request patch lands and enables the extra error checking.
>>
>> So fix it by checking for errors and aborting as appropriate when
>> creating kernel contexts. While at it, clean up some other submission
>> init related failure cleanup paths. Also, rename guc_init_lrc_mapping
>> to guc_init_submission as the former name hasn't been valid in a long
>> time.
>>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 91 ++++++++++++++-----
>>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  2 +-
>>   drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 +-
>>   3 files changed, 75 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 982364777d0c6..dd856fd92945b 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -1431,7 +1431,7 @@ static int guc_action_enable_usage_stats(struct 
>> intel_guc *guc)
>>       return intel_guc_send(guc, action, ARRAY_SIZE(action));
>>   }
>>   -static void guc_init_engine_stats(struct intel_guc *guc)
>> +static int guc_init_engine_stats(struct intel_guc *guc)
>>   {
>>       struct intel_gt *gt = guc_to_gt(guc);
>>       intel_wakeref_t wakeref;
>> @@ -1447,6 +1447,8 @@ static void guc_init_engine_stats(struct 
>> intel_guc *guc)
>>           cancel_delayed_work_sync(&guc->timestamp.work);
>>           drm_err(&gt->i915->drm, "Failed to enable usage stats: 
>> %d!\n", ret);
>>       }
>> +
>> +    return ret;
>>   }
>>     static void guc_park_engine_stats(struct intel_guc *guc)
>> @@ -4108,9 +4110,11 @@ static void guc_set_default_submission(struct 
>> intel_engine_cs *engine)
>>       engine->submit_request = guc_submit_request;
>>   }
>>   -static inline void guc_kernel_context_pin(struct intel_guc *guc,
>> -                      struct intel_context *ce)
>> +static inline int guc_kernel_context_pin(struct intel_guc *guc,
>> +                     struct intel_context *ce)
>>   {
>> +    int ret;
>> +
>>       /*
>>        * Note: we purposefully do not check the returns below because
>>        * the registration can only fail if a reset is just starting.
>> @@ -4118,16 +4122,24 @@ static inline void 
>> guc_kernel_context_pin(struct intel_guc *guc,
>>        * isn't happening and even it did this code would be run again.
>>        */
>>   -    if (context_guc_id_invalid(ce))
>> -        pin_guc_id(guc, ce);
>> +    if (context_guc_id_invalid(ce)) {
>> +        int ret = pin_guc_id(guc, ce);
>
> Why do you need a local ret variable inside this if statement, when 
> you already have a function-level one? or is it just a cut & paste error?
Yeah, copy/paste thing.

>
>> +
>> +        if (ret < 0)
>> +            return ret;
>> +    }
>>         if (!test_bit(CONTEXT_GUC_INIT, &ce->flags))
>>           guc_context_init(ce);
>>   -    try_context_registration(ce, true);
>> +    ret = try_context_registration(ce, true);
>> +    if (ret)
>> +        unpin_guc_id(guc, ce);
>> +
>> +    return ret;
>>   }
>>   -static inline void guc_init_lrc_mapping(struct intel_guc *guc)
>> +static inline int guc_init_submission(struct intel_guc *guc)
>>   {
>>       struct intel_gt *gt = guc_to_gt(guc);
>>       struct intel_engine_cs *engine;
>> @@ -4154,9 +4166,17 @@ static inline void guc_init_lrc_mapping(struct 
>> intel_guc *guc)
>>           struct intel_context *ce;
>>             list_for_each_entry(ce, &engine->pinned_contexts_list,
>> -                    pinned_contexts_link)
>> -            guc_kernel_context_pin(guc, ce);
>> +                    pinned_contexts_link) {
>> +            int ret = guc_kernel_context_pin(guc, ce);
>> +
>> +            if (ret) {
>> +                /* No point in trying to clean up as i915 will wedge 
>> on failure */
>> +                return ret;
>> +            }
>> +        }
>>       }
>> +
>> +    return 0;
>>   }
>>     static void guc_release(struct intel_engine_cs *engine)
>> @@ -4400,30 +4420,57 @@ static int 
>> guc_init_global_schedule_policy(struct intel_guc *guc)
>>       return ret;
>>   }
>>   -void intel_guc_submission_enable(struct intel_guc *guc)
>> +static void guc_route_semaphores(struct intel_guc *guc, bool to_guc)
>>   {
>>       struct intel_gt *gt = guc_to_gt(guc);
>> +    u32 val;
>>   -    /* Enable and route to GuC */
>> -    if (GRAPHICS_VER(gt->i915) >= 12)
>> -        intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES,
>> -                   GUC_SEM_INTR_ROUTE_TO_GUC |
>> -                   GUC_SEM_INTR_ENABLE_ALL);
>> +    if (GRAPHICS_VER(gt->i915) < 12)
>> +        return;
>>   -    guc_init_lrc_mapping(guc);
>> -    guc_init_engine_stats(guc);
>> -    guc_init_global_schedule_policy(guc);
>> +    if (to_guc)
>> +        val = GUC_SEM_INTR_ROUTE_TO_GUC | GUC_SEM_INTR_ENABLE_ALL;
>> +    else
>> +        val = 0;
>> +
>> +    intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES, val);
>> +}
>> +
>> +int intel_guc_submission_enable(struct intel_guc *guc)
>> +{
>> +    int ret;
>> +
>> +    /* Semaphore interrupt enable and route to GuC */
>> +    guc_route_semaphores(guc, true);
>> +
>> +    ret = guc_init_submission(guc);
>> +    if (ret)
>> +        goto fail_sem;
>> +
>> +    ret = guc_init_engine_stats(guc);
>> +    if (ret)
>> +        goto fail_sem;
>> +
>> +    ret = guc_init_global_schedule_policy(guc);
>> +    if (ret)
>> +        goto fail_stats;
>> +
>> +    return 0;
>> +
>> +fail_stats:
>> +    guc_park_engine_stats(guc);
>
> personal preference: I'd prefer an extra guc_fini_engine_stats wrapper 
> just so that we're balanced with the naming, but not a blocker.
As per comment on previous patch, I'm thinking I'll rename 
guc_park_engine_stats() to guc_cancel_busyness_worker() (and add a 
matching enable). And I agree on the naming here. But adding yet another 
wrapper would mean having this:
     guc_fini_engine_stats() {
         guc_cancel_busyness_worker() {
             cancel_delayed_work_sync();
         }
     }

Which seems excessive. A wrapper around a wrapper around a one line 
piece of code. I guess the compiler will optimise it all out and it 
leaves room for future expansion if other things need to happen in the 
middle layers in the future.

John.

>
> Daniele
>
>> +fail_sem:
>> +    guc_route_semaphores(guc, false);
>> +    return ret;
>>   }
>>     /* Note: By the time we're here, GuC may have already been reset */
>>   void intel_guc_submission_disable(struct intel_guc *guc)
>>   {
>> -    struct intel_gt *gt = guc_to_gt(guc);
>>       guc_park_engine_stats(guc);
>>   -    /* Disable and route to host */
>> -    if (GRAPHICS_VER(gt->i915) >= 12)
>> -        intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES, 
>> 0x0);
>> +    /* Semaphore interrupt disable and route to host */
>> +    guc_route_semaphores(guc, false);
>>   }
>>     static bool __guc_submission_supported(struct intel_guc *guc)
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>> index 5a95a9f0a8e31..c57b29cdb1a64 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
>> @@ -15,7 +15,7 @@ struct intel_engine_cs;
>>     void intel_guc_submission_init_early(struct intel_guc *guc);
>>   int intel_guc_submission_init(struct intel_guc *guc);
>> -void intel_guc_submission_enable(struct intel_guc *guc);
>> +int intel_guc_submission_enable(struct intel_guc *guc);
>>   void intel_guc_submission_disable(struct intel_guc *guc);
>>   void intel_guc_submission_fini(struct intel_guc *guc);
>>   int intel_guc_preempt_work_create(struct intel_guc *guc);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> index 9a8a1abf71d7f..8e338b3321a93 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> @@ -537,8 +537,11 @@ static int __uc_init_hw(struct intel_uc *uc)
>>       else
>>           intel_huc_auth(huc);
>>   -    if (intel_uc_uses_guc_submission(uc))
>> -        intel_guc_submission_enable(guc);
>> +    if (intel_uc_uses_guc_submission(uc)) {
>> +        ret = intel_guc_submission_enable(guc);
>> +        if (ret)
>> +            goto err_log_capture;
>> +    }
>>         if (intel_uc_uses_guc_slpc(uc)) {
>>           ret = intel_guc_slpc_enable(&guc->slpc);
>


  reply	other threads:[~2023-02-17 20:41 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-12  1:54 [Intel-gfx] [PATCH 0/2] Clean up some GuC related failure paths John.C.Harrison
2023-01-12  1:54 ` [Intel-gfx] [PATCH 1/2] drm/i915/guc: Improve clean up of busyness stats worker John.C.Harrison
2023-01-25  0:55   ` Ceraolo Spurio, Daniele
2023-02-17 20:13     ` John Harrison
2023-01-12  1:54 ` [Intel-gfx] [PATCH 2/2] drm/i915/guc: Fix missing return code checks in submission init John.C.Harrison
2023-01-25  1:01   ` Ceraolo Spurio, Daniele
2023-02-17 20:40     ` John Harrison [this message]
2023-01-12  2:16 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for Clean up some GuC related failure paths Patchwork
2023-01-12  2:33 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2023-01-12  4:02 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bc45f128-bb22-7b35-d931-22e13a7902c0@intel.com \
    --to=john.c.harrison@intel.com \
    --cc=DRI-Devel@Lists.FreeDesktop.Org \
    --cc=Intel-GFX@Lists.FreeDesktop.Org \
    --cc=daniele.ceraolospurio@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox