[PATCH v2 4/4] drm/tyr: add reset management API

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Onur Özkan" <work@onurozkan.dev>
To: dakr@kernel.org, aliceryhl@google.com,
	daniel.almeida@collabora.com, airlied@gmail.com, simona@ffwll.ch,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	rust-for-linux@vger.kernel.org
Cc: "Onur Özkan" <work@onurozkan.dev>
Subject: [PATCH v2 4/4] drm/tyr: add reset management API
Date: Thu, 16 Apr 2026 20:17:28 +0300	[thread overview]
Message-ID: <20260416171728.205141-3-work@onurozkan.dev> (raw)
In-Reply-To: <20260416171728.205141-1-work@onurozkan.dev>

Add Tyr reset handling on top of the Rust SRCU abstraction and use
a hardware gate to serialize reset-sensitive accesses against asynchronous
reset work.

This introduces `ResetHandle`, `HwGate`, and `HwGuard`, runs reset work
on a dedicated ordered workqueue and drains in-flight accesses before
running the reset sequence.

Signed-off-by: Onur Özkan <work@onurozkan.dev>
---
 drivers/gpu/drm/tyr/driver.rs        |  40 +---
 drivers/gpu/drm/tyr/reset.rs         | 293 +++++++++++++++++++++++++++
 drivers/gpu/drm/tyr/reset/hw_gate.rs | 155 ++++++++++++++
 drivers/gpu/drm/tyr/tyr.rs           |   1 +
 4 files changed, 459 insertions(+), 30 deletions(-)
 create mode 100644 drivers/gpu/drm/tyr/reset.rs
 create mode 100644 drivers/gpu/drm/tyr/reset/hw_gate.rs

diff --git a/drivers/gpu/drm/tyr/driver.rs b/drivers/gpu/drm/tyr/driver.rs
index 246bc3cb8580..178e48ccd434 100644
--- a/drivers/gpu/drm/tyr/driver.rs
+++ b/drivers/gpu/drm/tyr/driver.rs
@@ -6,11 +6,8 @@
         OptionalClk, //
     },
     device::{
-        Bound,
         Core,
-        Device, //
     },
-    devres::Devres,
     dma::{
         Device as DmaDevice,
         DmaMask, //
@@ -21,10 +18,6 @@
         ioctl,
         UnregisteredDevice, //
     },
-    io::{
-        poll,
-        Io, //
-    },
     new_mutex,
     of,
     platform,
@@ -37,17 +30,16 @@
         Arc,
         Mutex, //
     },
-    time, //
 };
 
 use crate::{
     file::TyrDrmFileData,
     fw::Firmware,
     gem::BoData,
-    gpu,
     gpu::GpuInfo,
     mmu::Mmu,
-    regs::gpu_control::*, //
+    regs::gpu_control::*,
+    reset, //
 };
 
 pub(crate) type IoMem = kernel::io::mem::IoMem<SZ_2M>;
@@ -62,6 +54,11 @@
 
 #[pin_data]
 pub(crate) struct TyrDrmDeviceData {
+    // `ResetHandle::drop()` drains queued/running works and this must happen
+    // before clocks/regulators are dropped. So keep this field before them to
+    // ensure the correct drop order.
+    pub(crate) reset: reset::ResetHandle,
+
     pub(crate) pdev: ARef<platform::Device>,
 
     pub(crate) fw: Arc<Firmware>,
@@ -90,24 +87,6 @@ unsafe impl Send for TyrDrmDeviceData {}
 // SAFETY: This will be removed in a future patch.
 unsafe impl Sync for TyrDrmDeviceData {}
 
-fn issue_soft_reset(dev: &Device<Bound>, iomem: &Devres<IoMem>) -> Result {
-    let io = (*iomem).access(dev)?;
-    io.write_reg(GPU_COMMAND::reset(ResetMode::SoftReset));
-
-    poll::read_poll_timeout(
-        || {
-            let io = (*iomem).access(dev)?;
-            Ok(io.read(GPU_IRQ_RAWSTAT))
-        },
-        |status| status.reset_completed(),
-        time::Delta::from_millis(1),
-        time::Delta::from_millis(100),
-    )
-    .inspect_err(|_| dev_err!(dev, "GPU reset failed."))?;
-
-    Ok(())
-}
-
 kernel::of_device_table!(
     OF_TABLE,
     MODULE_OF_TABLE,
@@ -140,8 +119,7 @@ fn probe(
         let request = pdev.io_request_by_index(0).ok_or(ENODEV)?;
         let iomem = Arc::pin_init(request.iomap_sized::<SZ_2M>(), GFP_KERNEL)?;
 
-        issue_soft_reset(pdev.as_ref(), &iomem)?;
-        gpu::l2_power_on(pdev.as_ref(), &iomem)?;
+        reset::run_reset(pdev.as_ref(), &iomem)?;
 
         let gpu_info = GpuInfo::new(pdev.as_ref(), &iomem)?;
         gpu_info.log(pdev.as_ref());
@@ -156,6 +134,7 @@ fn probe(
 
         let uninit_ddev = UnregisteredDevice::<TyrDrmDriver>::new(pdev.as_ref())?;
         let platform: ARef<platform::Device> = pdev.into();
+        let reset = reset::ResetHandle::new(platform.clone(), iomem.clone())?;
 
         let mmu = Mmu::new(pdev, iomem.as_arc_borrow(), &gpu_info)?;
 
@@ -181,6 +160,7 @@ fn probe(
                     _mali: mali_regulator,
                     _sram: sram_regulator,
                 }),
+                reset,
                 gpu_info,
         });
 
diff --git a/drivers/gpu/drm/tyr/reset.rs b/drivers/gpu/drm/tyr/reset.rs
new file mode 100644
index 000000000000..906051a1c667
--- /dev/null
+++ b/drivers/gpu/drm/tyr/reset.rs
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+
+//! Provides asynchronous reset handling for the Tyr DRM driver via [`ResetHandle`]
+//! which runs reset work on a dedicated ordered workqueue and avoids duplicate
+//! pending resets.
+//!
+//! # High-level Execution Flow
+//!
+//! ```text
+//!  User code                  Reset worker
+//!  ----------                 ------------
+//!  schedule()                 reset_work()
+//!    - Idle -> Pending          - Pending -> InProgress
+//!    - enqueue reset work       - synchronize() (wait for older accesses)
+//!                               - pre_reset() on reset managed hardware
+//!                               - run_reset()
+//!                               - post_reset() on reset managed hardware
+//!                               - epoch++
+//!                               - InProgress -> Idle
+//! ```
+
+mod hw_gate;
+
+use hw_gate::HwGate;
+
+use kernel::{
+    device::{
+        Bound,
+        Device, //
+    },
+    devres::Devres,
+    io::{
+        poll,
+        Io, //
+    },
+    platform,
+    prelude::*,
+    sync::{
+        aref::ARef,
+        atomic::AtomicType,
+        Arc, //
+    },
+    time,
+    workqueue::{
+        self,
+        OwnedQueue,
+        Queue,
+        Work, //
+    },
+};
+
+use crate::{
+    driver::IoMem,
+    gpu,
+    regs::gpu_control::*, //
+};
+
+/// Reset lifecycle state; a normal cycle goes `Idle` -> `Pending` -> `InProgress` -> `Idle`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+#[repr(i32)]
+enum ResetState {
+    /// No reset is pending or in progress; hardware access is allowed.
+    Idle = 0,
+    /// A reset has been scheduled but the worker has not claimed it yet.
+    Pending = 1,
+    /// The reset worker has claimed the request and is resetting the hardware.
+    InProgress = 2,
+}
+
+// SAFETY: `ResetState` is `#[repr(i32)]`, so it has the same size and alignment
+// as `i32`, and every `ResetState` value round-trips through `i32`.
+unsafe impl AtomicType for ResetState {
+    type Repr = i32;
+}
+
+/// Trait for the reset-managed hardware.
+///
+/// [`ActiveHwState`] groups the hardware blocks that implement this trait
+/// and defines their pre-reset and post-reset hook sequence.
+///
+/// Once reset scheduling flips the gate out of [`ResetState::Idle`], the reset
+/// worker first drains any pre-existing SRCU readers before running `pre_reset()`
+/// and `post_reset()` hooks.
+///
+/// `pre_reset()` is infallible and returning `Err` from `post_reset()` is treated
+/// as a reset-cycle failure.
+pub(crate) trait Resettable: Send + Sync {
+    /// Called before the reset sequence starts and the hardware is reset.
+    ///
+    /// Before this is called, the reset worker waits for all pre-existing
+    /// hardware accesses to complete.
+    fn pre_reset(&self);
+
+    /// Called after the hardware reset completes.
+    ///
+    /// `reset_failed` is `true` if an earlier stage in the current reset cycle
+    /// has already failed. Returning `Err` fails the entire cycle.
+    fn post_reset(&self, reset_failed: bool) -> Result;
+}
+
+/// Reset-managed hardware state coordinated by [`HwGate`].
+///
+/// Groups the driver components that must quiesce before a GPU reset and resume
+/// afterwards. The [`Resettable`] implementation defines the pre-reset and post-reset
+/// hook sequence for those components.
+struct ActiveHwState {
+    // TODO: hold the reset-managed blocks here, e.g. `mmu: Arc<Mmu>`.
+}
+
+impl Resettable for ActiveHwState {
+    fn pre_reset(&self) {
+        // TODO: call `self.mmu.pre_reset()` once the MMU is reset-managed.
+    }
+
+    fn post_reset(&self, _reset_failed: bool) -> Result {
+        // TODO: call `self.mmu.post_reset()?` once the MMU is reset-managed.
+        Ok(())
+    }
+}
+
+/// Internal reset orchestrator that owns the gate and work item.
+#[pin_data]
+struct Controller {
+    /// Platform device reference needed for reset operations and logging.
+    pdev: ARef<platform::Device>,
+    /// Mapped register space needed for reset operations.
+    iomem: Arc<Devres<IoMem>>,
+    /// Access gate for reset-managed hardware users.
+    #[pin]
+    hw: HwGate<ActiveHwState>,
+    /// Work item backing async reset processing; runs [`Controller::reset_work`].
+    #[pin]
+    work: Work<Controller>,
+}
+
+kernel::impl_has_work! {
+    impl HasWork<Controller> for Controller { self.work }
+}
+
+impl workqueue::WorkItem for Controller {
+    type Pointer = Arc<Self>;
+
+    fn run(controller: Arc<Self>) {
+        Self::reset_work(&controller);
+    }
+}
+
+impl Controller {
+    /// Creates an [`Arc<Controller>`] ready for use.
+    fn new(pdev: ARef<platform::Device>, iomem: Arc<Devres<IoMem>>) -> Result<Arc<Self>> {
+        Arc::pin_init(
+            try_pin_init!(Self {
+                pdev,
+                iomem,
+                hw <- HwGate::new(ActiveHwState {}),
+                work <- kernel::new_work!("tyr::reset"),
+            }),
+            GFP_KERNEL,
+        )
+    }
+
+    /// Processes one scheduled reset request.
+    ///
+    /// If the pending reset cannot be claimed, the worker returns immediately.
+    ///
+    /// It first claims [`ResetState::Pending`], waits for earlier hardware accesses to
+    /// complete and runs the pre-reset hook. It then issues the hardware reset and runs
+    /// the post-reset hook; a failure of either marks the cycle as failed. Finally the
+    /// gate returns to the [`ResetState::Idle`] state.
+    ///
+    /// Panthor reference:
+    /// - drivers/gpu/drm/panthor/panthor_device.c::panthor_device_reset_work()
+    fn reset_work(self: &Arc<Self>) {
+        if !self.hw.start_reset() {
+            // The gate is not in `Pending` (e.g. the request was cancelled); nothing to do.
+            return;
+        }
+
+        dev_info!(self.pdev.as_ref(), "Starting GPU reset.\n");
+
+        // Wait for all hardware accesses that started before reset became
+        // visible to finish before running the reset callbacks.
+        self.hw.synchronize();
+
+        self.hw.pre_reset();
+
+        // SAFETY: `Controller` is part of driver-private data and only exists
+        // while the platform device is bound.
+        let pdev = unsafe { self.pdev.as_ref().as_bound() };
+
+        let mut reset_failed = false;
+        if let Err(e) = run_reset(pdev, &self.iomem) {
+            reset_failed = true;
+            dev_err!(self.pdev.as_ref(), "GPU reset failed: {:?}\n", e);
+        }
+
+        if let Err(e) = self.hw.post_reset(reset_failed) {
+            // TODO: Unplug the GPU; there is no API for that yet.
+            reset_failed = true;
+            dev_err!(self.pdev.as_ref(), "GPU post-reset failed: {:?}\n", e);
+        }
+
+        if reset_failed {
+            dev_err!(self.pdev.as_ref(), "GPU reset cycle failed.\n");
+        } else {
+            dev_info!(self.pdev.as_ref(), "GPU reset completed.\n");
+        }
+
+        self.hw.finish_reset();
+    }
+}
+
+/// User-facing handle for scheduling resets; owns the reset controller and its workqueue.
+///
+/// Dropping the handle drains any queued or in-flight reset work to ensure a
+/// clean teardown before clocks and regulators are released.
+pub(crate) struct ResetHandle {
+    controller: Arc<Controller>,
+    wq: OwnedQueue,
+}
+
+impl ResetHandle {
+    /// Builds a [`ResetHandle`] with its controller and dedicated ordered workqueue.
+    pub(crate) fn new(pdev: ARef<platform::Device>, iomem: Arc<Devres<IoMem>>) -> Result<Self> {
+        let controller = Controller::new(pdev, iomem)?;
+        let wq = Queue::new_ordered().build(c"tyr-reset-wq")?;
+
+        Ok(Self { controller, wq })
+    }
+
+    /// Requests an asynchronous GPU reset on the dedicated workqueue.
+    ///
+    /// Does nothing when a reset is already pending or in progress.
+    #[expect(dead_code)]
+    pub(crate) fn schedule(&self) {
+        // TODO: Similar to `panthor_device_schedule_reset()` in Panthor, add a
+        // power management check once Tyr supports it.
+
+        // Allow a single queued or running reset; undo the claim if queueing fails.
+        let gate = &self.controller.hw;
+        if gate.begin_reset() && self.wq.enqueue(self.controller.clone()).is_err() {
+            gate.cancel_reset();
+        }
+    }
+}
+
+impl Drop for ResetHandle {
+    fn drop(&mut self) {
+        // Drain queued/running reset work and block future queueing attempts for
+        // this work item before clocks/regulators are cleaned up.
+        self.controller.work.disable_sync();
+    }
+}
+
+/// Issues a soft reset command and polls the raw IRQ status until reset-complete is set.
+fn issue_soft_reset(dev: &Device<Bound>, iomem: &Devres<IoMem>) -> Result {
+    let io = (*iomem).access(dev)?;
+
+    // Clear any stale reset-complete IRQ state before issuing a new soft reset.
+    io.write_reg(GPU_IRQ_CLEAR::zeroed().with_reset_completed(true));
+
+    io.write_reg(GPU_COMMAND::reset(ResetMode::SoftReset));
+
+    poll::read_poll_timeout(
+        || {
+            let io = (*iomem).access(dev)?;
+            Ok(io.read(GPU_IRQ_RAWSTAT))
+        },
+        |status| status.reset_completed(),
+        time::Delta::from_millis(1),
+        time::Delta::from_millis(100),
+    )
+    .inspect_err(|_| dev_err!(dev, "GPU reset timed out.\n"))?;
+
+    Ok(())
+}
+
+/// Runs one synchronous GPU reset pass, returning the first error encountered.
+///
+/// Its visibility is `pub(super)` only so the probe path can run an
+/// initial reset; it is not part of this module's public API.
+///
+/// On success, the GPU is left in a state suitable for reinitialization.
+///
+/// The sequence is as follows:
+///   - Trigger a GPU soft reset.
+///   - Wait for the reset-complete IRQ status.
+///   - Power L2 back on.
+pub(super) fn run_reset(dev: &Device<Bound>, iomem: &Devres<IoMem>) -> Result {
+    issue_soft_reset(dev, iomem)?;
+    gpu::l2_power_on(dev, iomem)?;
+    Ok(())
+}
diff --git a/drivers/gpu/drm/tyr/reset/hw_gate.rs b/drivers/gpu/drm/tyr/reset/hw_gate.rs
new file mode 100644
index 000000000000..ff304ca127f3
--- /dev/null
+++ b/drivers/gpu/drm/tyr/reset/hw_gate.rs
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+
+//! SRCU based hardware access gate.
+//!
+//! This module provides [`HwGate`] which is a generic, SRCU based gate
+//! that serialises hardware access against asynchronous reset cycles.
+
+use super::{
+    ResetState,
+    Resettable, //
+};
+
+use kernel::{
+    prelude::*,
+    sync::{
+        atomic::{
+            Atomic,
+            Relaxed, //
+        },
+        srcu, Srcu,
+    },
+};
+
+use core::ops::Deref;
+
+/// An SRCU-backed gate that coordinates hardware access with asynchronous resets.
+#[pin_data]
+pub(crate) struct HwGate<T: Resettable> {
+    #[pin]
+    srcu: Srcu,
+    state: Atomic<ResetState>,
+    epoch: Atomic<u64>,
+    hw: T,
+}
+
+impl<T: Resettable> HwGate<T> {
+    /// Creates a new gate for the given `hw` in [`ResetState::Idle`] state.
+    pub(super) fn new(hw: T) -> impl PinInit<Self, Error> {
+        try_pin_init!(Self {
+            srcu <- kernel::new_srcu!(),
+            state: Atomic::new(ResetState::Idle),
+            epoch: Atomic::new(0),
+            hw,
+        })
+    }
+
+    /// Tries to acquire the hardware access guard.
+    ///
+    /// Returns [`EBUSY`] if a reset is pending or in progress.
+    pub(crate) fn try_access(&self) -> Result<HwGuard<'_, T>> {
+        let srcu = self.srcu.read_lock();
+        // Acquire pairs with the release store in `finish_reset()`.
+        if self.state.load(kernel::sync::atomic::Acquire) != ResetState::Idle {
+            return Err(EBUSY);
+        }
+
+        let epoch = self.epoch.load(Relaxed);
+
+        Ok(HwGuard {
+            hw: &self.hw,
+            epoch,
+            _srcu: srcu,
+        })
+    }
+
+    /// Runs `f` with [`HwGuard`], failing fast with [`EBUSY`] if a reset is
+    /// pending or in progress.
+    #[expect(dead_code)]
+    pub(crate) fn with_hw<R>(&self, f: impl FnOnce(&HwGuard<'_, T>) -> Result<R>) -> Result<R> {
+        let guard = self.try_access()?;
+        f(&guard)
+    }
+
+    /// Transitions from [`ResetState::Idle`] to [`ResetState::Pending`].
+    ///
+    /// Returns `true` if the transition succeeded (i.e. no reset was already
+    /// scheduled).
+    pub(super) fn begin_reset(&self) -> bool {
+        self.state
+            .cmpxchg(ResetState::Idle, ResetState::Pending, Relaxed)
+            .is_ok()
+    }
+
+    /// Transitions from [`ResetState::Pending`] to [`ResetState::InProgress`].
+    ///
+    /// Returns `true` if the transition succeeded.
+    pub(super) fn start_reset(&self) -> bool {
+        self.state
+            .cmpxchg(ResetState::Pending, ResetState::InProgress, Relaxed)
+            .is_ok()
+    }
+
+    /// Bumps the epoch and returns to [`ResetState::Idle`]; the release store makes the
+    /// new epoch and post-reset state visible to [`Self::try_access`] callers.
+    pub(super) fn finish_reset(&self) {
+        self.epoch.fetch_add(1, Relaxed);
+        self.state.store(ResetState::Idle, kernel::sync::atomic::Release);
+    }
+
+    /// Transitions from [`ResetState::Pending`] to [`ResetState::Idle`].
+    pub(super) fn cancel_reset(&self) {
+        self.state.store(ResetState::Idle, Relaxed);
+    }
+
+    /// Waits for all pre-existing SRCU readers to complete.
+    ///
+    /// This must only be called from the reset worker after the state has left
+    /// [`ResetState::Idle`], so that no new readers can enter.
+    pub(super) fn synchronize(&self) {
+        self.srcu.synchronize();
+    }
+}
+
+impl<T: Resettable> Resettable for HwGate<T> {
+    fn pre_reset(&self) {
+        T::pre_reset(&self.hw)
+    }
+
+    fn post_reset(&self, reset_failed: bool) -> Result {
+        T::post_reset(&self.hw, reset_failed)
+    }
+}
+
+/// A hardware guard that only exists while the hardware is accessible; derefs to `T`.
+///
+/// Holding a [`HwGuard`] means the hardware is still in use and prevents
+/// the reset path from proceeding. The reset worker waits for all active
+/// guards to be dropped before it continues with the reset.
+#[must_use = "the hardware guard must be kept alive while using reset-sensitive state"]
+pub(crate) struct HwGuard<'a, T> {
+    hw: &'a T,
+    epoch: u64,
+    _srcu: srcu::Guard<'a>,
+}
+
+impl<T> HwGuard<'_, T> {
+    /// Returns the epoch at which this guard was acquired.
+    ///
+    /// This is a snapshot of [`HwGate`]'s epoch counter taken when the guard
+    /// was acquired. The gate increments that counter each time a reset cycle
+    /// completes. Callers can compare epochs from separate access windows to
+    /// detect whether a reset happened in between.
+    #[expect(dead_code)]
+    pub(crate) fn epoch(&self) -> u64 {
+        self.epoch
+    }
+}
+
+impl<T> Deref for HwGuard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &T {
+        self.hw
+    }
+}
diff --git a/drivers/gpu/drm/tyr/tyr.rs b/drivers/gpu/drm/tyr/tyr.rs
index 18b0668bb217..d0349bc49f27 100644
--- a/drivers/gpu/drm/tyr/tyr.rs
+++ b/drivers/gpu/drm/tyr/tyr.rs
@@ -14,6 +14,7 @@
 mod gpu;
 mod mmu;
 mod regs;
+mod reset;
 mod slot;
 mod vm;
 
-- 
2.51.2

next prev parent reply	other threads:[~2026-04-16 17:17 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16 17:17 [PATCH v2 0/4] drm/tyr: implement GPU reset API Onur Özkan
2026-04-16 17:17 ` [PATCH v2 3/4] rust: add Work::disable_sync Onur Özkan
2026-04-16 17:17 ` Onur Özkan [this message]
2026-04-16 17:23 ` [PATCH v2 0/4] drm/tyr: implement GPU reset API Onur Özkan
2026-04-16 18:45   ` Boqun Feng
2026-04-17  8:02     ` Onur Özkan
2026-04-28 10:49     ` Onur Özkan
2026-04-16 17:43 ` [PATCH v2 RESEND 1/4] rust: add SRCU abstraction Onur Özkan
2026-04-16 17:43   ` [PATCH v2 RESEND 2/4] MAINTAINERS: add Rust SRCU files to SRCU entry Onur Özkan
2026-04-21 16:14   ` [PATCH v2 RESEND 1/4] rust: add SRCU abstraction Gary Guo

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:246bc3cb858 dfblob:178e48ccd43 dfblob:906051a1c66
dfblob:ff304ca127f dfblob:18b0668bb21 dfblob:d0349bc49f2 )
 OR (
bs:"[PATCH v2 4/4] drm/tyr: add reset management API" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260416171728.205141-3-work@onurozkan.dev \
    --to=work@onurozkan.dev \
    --cc=airlied@gmail.com \
    --cc=aliceryhl@google.com \
    --cc=dakr@kernel.org \
    --cc=daniel.almeida@collabora.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rust-for-linux@vger.kernel.org \
    --cc=simona@ffwll.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.