Linux PCI subsystem development
 help / color / mirror / Atom feed
From: Joel Fernandes <joelagnelf@nvidia.com>
To: Alexandre Courbot <acourbot@nvidia.com>
Cc: "Danilo Krummrich" <dakr@kernel.org>,
	"Alice Ryhl" <aliceryhl@google.com>,
	"David Airlie" <airlied@gmail.com>,
	"Simona Vetter" <simona@ffwll.ch>,
	"Bjorn Helgaas" <bhelgaas@google.com>,
	"Krzysztof Wilczyński" <kwilczynski@kernel.org>,
	"Miguel Ojeda" <ojeda@kernel.org>,
	"Boqun Feng" <boqun.feng@gmail.com>,
	"Gary Guo" <gary@garyguo.net>,
	"Björn Roy Baron" <bjorn3_gh@protonmail.com>,
	"Benno Lossin" <lossin@kernel.org>,
	"Andreas Hindborg" <a.hindborg@kernel.org>,
	"Trevor Gross" <tmgross@umich.edu>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Alistair Popple" <apopple@nvidia.com>,
	"Timur Tabi" <ttabi@nvidia.com>, "Edwin Peer" <epeer@nvidia.com>,
	"Eliot Courtney" <ecourtney@nvidia.com>,
	"nouveau@lists.freedesktop.org" <nouveau@lists.freedesktop.org>,
	"dri-devel@lists.freedesktop.org"
	<dri-devel@lists.freedesktop.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-pci@vger.kernel.org" <linux-pci@vger.kernel.org>,
	"rust-for-linux@vger.kernel.org" <rust-for-linux@vger.kernel.org>,
	"Alexandre Courbot" <acourbot@nvidia.com>
Subject: Re: [PATCH 6/7] gpu: nova-core: send UNLOADING_GUEST_DRIVER GSP command GSP upon unloading
Date: Tue, 16 Dec 2025 15:39:30 +0000	[thread overview]
Message-ID: <C890CCBB-76C0-4E70-A7B8-846E34DABECE@nvidia.com> (raw)
In-Reply-To: <20251216-nova-unload-v1-6-6a5d823be19d@nvidia.com>

Hi Alex,

Just did a quick pass, will do a proper review later:

> On Dec 16, 2025, at 12:13 AM, Alexandre Courbot <acourbot@nvidia.com> wrote:
> 
> Currently, the GSP is left running after the driver is unbound. This is
> not great for several reasons, notably that it can still access shared
> memory areas that the kernel will now reclaim (especially problematic on
> setups without an IOMMU).
> 
> Fix this by sending the `UNLOADING_GUEST_DRIVER` GSP command when
> unbinding. This stops the GSP and let us proceed with the rest of the
> unbind sequence in the next patch.
> 
> Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
> ---
> drivers/gpu/nova-core/gpu.rs                      |  5 +++
> drivers/gpu/nova-core/gsp/boot.rs                 | 42 +++++++++++++++++++++++
> drivers/gpu/nova-core/gsp/commands.rs             | 42 +++++++++++++++++++++++
> drivers/gpu/nova-core/gsp/fw.rs                   |  4 +++
> drivers/gpu/nova-core/gsp/fw/commands.rs          | 27 +++++++++++++++
> drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs |  8 +++++
> 6 files changed, 128 insertions(+)
> 
> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
> index ef6ceced350e..b94784f57b36 100644
> --- a/drivers/gpu/nova-core/gpu.rs
> +++ b/drivers/gpu/nova-core/gpu.rs
> @@ -291,6 +291,9 @@ pub(crate) fn new<'a>(
> 
>     /// Called when the corresponding [`Device`](device::Device) is unbound.
>     ///
> +    /// Prepares the GPU for unbinding by shutting down the GSP and unregistering the sysmem flush
> +    /// memory page.
> +    ///
>     /// Note: This method must only be called from `Driver::unbind`.
>     pub(crate) fn unbind(self: Pin<&mut Self>, dev: &device::Device<device::Core>) {
>         let this = self.project();
> @@ -299,6 +302,8 @@ pub(crate) fn unbind(self: Pin<&mut Self>, dev: &device::Device<device::Core>) {
>             return;
>         };
> 
> +        let _ = kernel::warn_on_err!(this.gsp.unload(dev, bar, this.gsp_falcon,));
> +
>         this.sysmem_flush.unregister(bar);
>     }
> }
> diff --git a/drivers/gpu/nova-core/gsp/boot.rs b/drivers/gpu/nova-core/gsp/boot.rs
> index ca7efdaa753b..e12e1d3fd53f 100644
> --- a/drivers/gpu/nova-core/gsp/boot.rs
> +++ b/drivers/gpu/nova-core/gsp/boot.rs
> @@ -32,6 +32,7 @@
>     },
>     gpu::Chipset,
>     gsp::{
> +        cmdq::Cmdq,
>         commands,
>         sequencer::{
>             GspSequencer,
> @@ -231,4 +232,45 @@ pub(crate) fn boot(
> 
>         Ok(())
>     }
> +
> +    /// Shutdowns the GSP and wait until it is offline.
> +    fn shutdown_gsp(
> +        cmdq: &mut Cmdq,
> +        bar: &Bar0,
> +        gsp_falcon: &Falcon<Gsp>,
> +        suspend: bool,
> +    ) -> Result<()> {
> +        // Send command to shutdown GSP and wait for response.
> +        commands::unload_guest_driver(cmdq, bar, suspend)?;
> +
> +        // Wait until GSP signals it is suspended.
> +        const LIBOS_INTERRUPT_PROCESSOR_SUSPENDED: u32 = 0x8000_0000;
> +        read_poll_timeout(
> +            || Ok(gsp_falcon.read_mailbox0(bar)),
> +            |&mb0| mb0 == LIBOS_INTERRUPT_PROCESSOR_SUSPENDED,
> +            Delta::ZERO,
> +            Delta::from_secs(5),
> +        )
> +        .map(|_| ())
> +    }
> +
> +    /// Attempts to unload the GSP firmware.
> +    ///
> +    /// This stops all activity on the GSP.
> +    pub(crate) fn unload(
> +        self: Pin<&mut Self>,
> +        dev: &device::Device<device::Bound>,
> +        bar: &Bar0,
> +        gsp_falcon: &Falcon<Gsp>,
> +    ) -> Result {
> +        let this = self.project();
> +
> +        /* Shutdown the GSP */
> +
> +        Self::shutdown_gsp(this.cmdq, bar, gsp_falcon, false)
> +            .inspect_err(|e| dev_err!(dev, "unload guest driver failed: {:?}", e))?;
> +        dev_dbg!(dev, "GSP shut down\n");
> +
> +        Ok(())
> +    }
> }
> diff --git a/drivers/gpu/nova-core/gsp/commands.rs b/drivers/gpu/nova-core/gsp/commands.rs
> index 2050771f9b53..0bcfca8c7e42 100644
> --- a/drivers/gpu/nova-core/gsp/commands.rs
> +++ b/drivers/gpu/nova-core/gsp/commands.rs
> @@ -225,3 +225,45 @@ pub(crate) fn get_gsp_info(cmdq: &mut Cmdq, bar: &Bar0) -> Result<GetGspStaticIn
>         }
>     }
> }
> +
> +impl CommandToGsp for UnloadingGuestDriver {
> +    const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver;
> +    type Command = Self;
> +    type InitError = Infallible;
> +
> +    fn init(&self) -> impl Init<Self::Command, Self::InitError> {
> +        *self
> +    }
> +}
> +
> +pub(crate) struct UnloadingGuestDriverReply;
> +
> +impl MessageFromGsp for UnloadingGuestDriverReply {
> +    const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver;
> +    type InitError = Infallible;
> +    type Message = ();
> +
> +    fn read(
> +        _msg: &Self::Message,
> +        _sbuffer: &mut SBufferIter<array::IntoIter<&[u8], 2>>,
> +    ) -> Result<Self, Self::InitError> {
> +        Ok(UnloadingGuestDriverReply)
> +    }
> +}
> +
> +/// Send the [`UnloadingGuestDriver`] command to the GSP and await the reply.
> +pub(crate) fn unload_guest_driver(
> +    cmdq: &mut Cmdq,
> +    bar: &Bar0,
> +    suspend: bool,
> +) -> Result<UnloadingGuestDriverReply> {
> +    cmdq.send_command(bar, UnloadingGuestDriver::new(suspend))?;
> +
> +    loop {
> +        match cmdq.receive_msg::<UnloadingGuestDriverReply>(Delta::from_secs(5)) {
> +            Ok(resp) => return Ok(resp),
> +            Err(ERANGE) => continue,
> +            Err(e) => return Err(e),
> +        }

This outer loop can go on infinitely, lets bound it?

Either outer timeout or bounded number of tries. Bounded tries better since it has inner timeout.

> +    }
> +}
> diff --git a/drivers/gpu/nova-core/gsp/fw.rs b/drivers/gpu/nova-core/gsp/fw.rs
> index 09549f7db52d..228464fd47a0 100644
> --- a/drivers/gpu/nova-core/gsp/fw.rs
> +++ b/drivers/gpu/nova-core/gsp/fw.rs
> @@ -209,6 +209,7 @@ pub(crate) enum MsgFunction {
>     GspInitPostObjGpu = bindings::NV_VGPU_MSG_FUNCTION_GSP_INIT_POST_OBJGPU,
>     GspRmControl = bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL,
>     GetStaticInfo = bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO,
> +    UnloadingGuestDriver = bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER,
> 
>     // Event codes
>     GspInitDone = bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE,
> @@ -249,6 +250,9 @@ fn try_from(value: u32) -> Result<MsgFunction> {
>             }
>             bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL => Ok(MsgFunction::GspRmControl),
>             bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO => Ok(MsgFunction::GetStaticInfo),
> +            bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER => {
> +                Ok(MsgFunction::UnloadingGuestDriver)
> +            }
>             bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE => Ok(MsgFunction::GspInitDone),
>             bindings::NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER => {
>                 Ok(MsgFunction::GspRunCpuSequencer)
> diff --git a/drivers/gpu/nova-core/gsp/fw/commands.rs b/drivers/gpu/nova-core/gsp/fw/commands.rs
> index 85465521de32..c7df4783ad21 100644
> --- a/drivers/gpu/nova-core/gsp/fw/commands.rs
> +++ b/drivers/gpu/nova-core/gsp/fw/commands.rs
> @@ -125,3 +125,30 @@ unsafe impl AsBytes for GspStaticConfigInfo {}
> // SAFETY: This struct only contains integer types for which all bit patterns
> // are valid.
> unsafe impl FromBytes for GspStaticConfigInfo {}
> +
> +/// Payload of the `UnloadingGuestDriver` command and message.
> +#[repr(transparent)]
> +#[derive(Clone, Copy, Debug, Zeroable)]
> +pub(crate) struct UnloadingGuestDriver {
> +    inner: bindings::rpc_unloading_guest_driver_v1F_07,
> +}
> +
> +impl UnloadingGuestDriver {
> +    pub(crate) fn new(suspend: bool) -> Self {
> +        Self {
> +            inner: bindings::rpc_unloading_guest_driver_v1F_07 {

We should go through intermediate firmware representation than direct bindings access?


> +                bInPMTransition: if suspend { 1 } else { 0 },

Then this can just be passed as a bool.

> +                bGc6Entering: 0,
> +                newLevel: if suspend { 3 } else { 0 },
> +                ..Zeroable::zeroed()
> +            },
> +        }
> +    }
> +}
> +
> +// SAFETY: Padding is explicit and will not contain uninitialized data.
> +unsafe impl AsBytes for UnloadingGuestDriver {}
> +
> +// SAFETY: This struct only contains integer types for which all bit patterns
> +// are valid.
> +unsafe impl FromBytes for UnloadingGuestDriver {}
> diff --git a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> index 6d25fe0bffa9..212ccc13c0cc 100644
> --- a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> +++ b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> @@ -879,6 +879,14 @@ fn default() -> Self {
>     }
> }
> #[repr(C)]
> +#[derive(Debug, Default, Copy, Clone, MaybeZeroable)]
> +pub struct rpc_unloading_guest_driver_v1F_07 {
> +    pub bInPMTransition: u8_,
> +    pub bGc6Entering: u8_,
> +    pub __bindgen_padding_0: [u8; 2usize],
> +    pub newLevel: u32_,

When these are intermediate represented, documentation would be nice on the fields.

thanks,

 - Joel

> +}
> +#[repr(C)]
> #[derive(Debug, Default, MaybeZeroable)]
> pub struct rpc_run_cpu_sequencer_v17_00 {
>     pub bufferSizeDWord: u32_,
> 
> --
> 2.52.0
> 

  reply	other threads:[~2025-12-16 15:39 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-16  5:13 [PATCH 0/7] gpu: nova-core: run unload sequence upon unbinding Alexandre Courbot
2025-12-16  5:13 ` [PATCH 1/7] rust: pci: pass driver data by value to `unbind` Alexandre Courbot
2025-12-16 12:14   ` Danilo Krummrich
2025-12-16 14:38     ` Alexandre Courbot
2025-12-16  5:13 ` [PATCH 2/7] rust: add warn_on_err macro Alexandre Courbot
2025-12-16  5:13 ` [PATCH 3/7] gpu: nova-core: use " Alexandre Courbot
2025-12-16  5:13 ` [PATCH RFC 4/7] rust: pin-init: allow `dead_code` on projection structure Alexandre Courbot
2025-12-16  6:12   ` Benno Lossin
2025-12-16  5:13 ` [PATCH 5/7] gpu: nova-nova: use pin-init projections Alexandre Courbot
2025-12-16  5:13 ` [PATCH 6/7] gpu: nova-core: send UNLOADING_GUEST_DRIVER GSP command GSP upon unloading Alexandre Courbot
2025-12-16 15:39   ` Joel Fernandes [this message]
2025-12-18 13:27     ` Alexandre Courbot
2025-12-18 20:52       ` Joel Fernandes
2025-12-19  3:26         ` Alexandre Courbot
2025-12-19  6:42           ` Joel Fernandes
2025-12-18 22:33       ` Timur Tabi
2025-12-18 22:44         ` Joel Fernandes
2025-12-18 23:34           ` Timur Tabi
2025-12-19  1:46             ` Joel Fernandes
2025-12-19  1:48               ` Joel Fernandes
2025-12-19  3:39         ` Alexandre Courbot
2025-12-20 21:30           ` Timur Tabi
2026-01-14 14:02             ` Alexandre Courbot
2025-12-16  5:13 ` [PATCH 7/7] gpu: nova-core: run Booter Unloader and FWSEC-SB upon unbinding Alexandre Courbot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=C890CCBB-76C0-4E70-A7B8-846E34DABECE@nvidia.com \
    --to=joelagnelf@nvidia.com \
    --cc=a.hindborg@kernel.org \
    --cc=acourbot@nvidia.com \
    --cc=airlied@gmail.com \
    --cc=aliceryhl@google.com \
    --cc=apopple@nvidia.com \
    --cc=bhelgaas@google.com \
    --cc=bjorn3_gh@protonmail.com \
    --cc=boqun.feng@gmail.com \
    --cc=dakr@kernel.org \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=ecourtney@nvidia.com \
    --cc=epeer@nvidia.com \
    --cc=gary@garyguo.net \
    --cc=jhubbard@nvidia.com \
    --cc=kwilczynski@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=lossin@kernel.org \
    --cc=nouveau@lists.freedesktop.org \
    --cc=ojeda@kernel.org \
    --cc=rust-for-linux@vger.kernel.org \
    --cc=simona@ffwll.ch \
    --cc=tmgross@umich.edu \
    --cc=ttabi@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox