[PATCH v2 0/6] drm/panthor: Misc fixes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 0/6] drm/panthor: Misc fixes
@ 2025-11-13 10:39 Boris Brezillon
  2025-11-13 10:39 ` [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
                   ` (5 more replies)
  0 siblings, 6 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Hello,

This is a set of fixes for bugs I ran into while looking at [1].
Hopefully that's enough to recover from AS_ACTIVE bit stuck
situations, but it'd be good to understand why the MMU block is
completely blocked in some cases and try to come up with better
mitigations than a full GPU reset.

Check each patch for a detailed changelog.

Regards,

Boris

Boris Brezillon (6):
  drm/panthor: Always wait after sending a command to an AS
  drm/panthor: Kill lock_region()
  drm/panthor: Recover from panthor_gpu_flush_caches() failures
  drm/panthor: Add support for atomic page table updates
  drm/panthor: Make panthor_vm_[un]map_pages() more robust
  drm/panthor: Relax a check in panthor_sched_pre_reset()

 drivers/gpu/drm/panthor/panthor_gpu.c   |  19 +-
 drivers/gpu/drm/panthor/panthor_mmu.c   | 277 +++++++++++++-----------
 drivers/gpu/drm/panthor/panthor_sched.c |   2 -
 3 files changed, 164 insertions(+), 134 deletions(-)

-- 
2.51.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 12:29   ` Steven Price
  2025-11-13 10:39 ` [PATCH v2 2/6] drm/panthor: Kill lock_region() Boris Brezillon
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

There's currently no situation where we want to issue a command to an
AS and not wait for this command to complete. The wait is either
explicitly done (LOCK, UNLOCK) or it's missing (UPDATE). So let's
turn write_cmd() into as_send_cmd_and_wait() that has the wait after
a command is sent.

v2:
- New patch

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 6a41dfd7aaf3..186048fc2c25 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -524,27 +524,29 @@ static int wait_ready(struct panthor_device *ptdev, u32 as_nr)
 	return ret;
 }
 
-static int write_cmd(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
+static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
 {
 	int status;
 
 	/* write AS_COMMAND when MMU is ready to accept another command */
 	status = wait_ready(ptdev, as_nr);
-	if (!status)
+	if (!status) {
 		gpu_write(ptdev, AS_COMMAND(as_nr), cmd);
+		status = wait_ready(ptdev, as_nr);
+	}
 
 	return status;
 }
 
-static void lock_region(struct panthor_device *ptdev, u32 as_nr,
-			u64 region_start, u64 size)
+static int lock_region(struct panthor_device *ptdev, u32 as_nr,
+		       u64 region_start, u64 size)
 {
 	u8 region_width;
 	u64 region;
 	u64 region_end = region_start + size;
 
 	if (!size)
-		return;
+		return 0;
 
 	/*
 	 * The locked region is a naturally aligned power of 2 block encoded as
@@ -567,7 +569,7 @@ static void lock_region(struct panthor_device *ptdev, u32 as_nr,
 
 	/* Lock the region that needs to be updated */
 	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
-	write_cmd(ptdev, as_nr, AS_COMMAND_LOCK);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
 }
 
 static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
@@ -600,9 +602,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * power it up
 	 */
 
-	lock_region(ptdev, as_nr, iova, size);
-
-	ret = wait_ready(ptdev, as_nr);
+	ret = lock_region(ptdev, as_nr, iova, size);
 	if (ret)
 		return ret;
 
@@ -615,10 +615,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * at the end of the GPU_CONTROL cache flush command, unlike
 	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
 	 */
-	write_cmd(ptdev, as_nr, AS_COMMAND_UNLOCK);
-
-	/* Wait for the unlock command to complete */
-	return wait_ready(ptdev, as_nr);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
 }
 
 static int mmu_hw_do_operation(struct panthor_vm *vm,
@@ -647,7 +644,7 @@ static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
 
-	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
 }
 
 static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
@@ -662,7 +659,7 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), 0);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), AS_TRANSCFG_ADRMODE_UNMAPPED);
 
-	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
 }
 
 static u32 panthor_mmu_fault_mask(struct panthor_device *ptdev, u32 value)
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/6] drm/panthor: Kill lock_region()
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
  2025-11-13 10:39 ` [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 12:44   ` Steven Price
  2025-11-13 10:39 ` [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

The meat in lock_region() is about packing a region range into a
single u64. The rest is just a regular reg write plus a
as_send_cmd_and_wait() call that can easily be inlined in
mmu_hw_do_operation_locked().

v2:
- New patch

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 186048fc2c25..f109c1588186 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -538,11 +538,9 @@ static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd
 	return status;
 }
 
-static int lock_region(struct panthor_device *ptdev, u32 as_nr,
-		       u64 region_start, u64 size)
+static u64 pack_region_range(u64 region_start, u64 size)
 {
 	u8 region_width;
-	u64 region;
 	u64 region_end = region_start + size;
 
 	if (!size)
@@ -565,11 +563,7 @@ static int lock_region(struct panthor_device *ptdev, u32 as_nr,
 	 */
 	region_start &= GENMASK_ULL(63, region_width);
 
-	region = region_width | region_start;
-
-	/* Lock the region that needs to be updated */
-	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
-	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
+	return region_width | region_start;
 }
 
 static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
@@ -602,7 +596,9 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * power it up
 	 */
 
-	ret = lock_region(ptdev, as_nr, iova, size);
+	/* Lock the region that needs to be updated */
+	gpu_write64(ptdev, AS_LOCKADDR(as_nr), pack_region_range(iova, size));
+	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
 	if (ret)
 		return ret;
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
  2025-11-13 10:39 ` [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
  2025-11-13 10:39 ` [PATCH v2 2/6] drm/panthor: Kill lock_region() Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 14:02   ` Steven Price
  2025-11-13 10:39 ` [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

We have seen a few cases where the whole memory subsystem is blocked
and flush operations never complete. When that happens, we want to:

- schedule a reset, so we can recover from this situation
- in the reset path, we need to reset the pending_reqs so we can send
  new commands after the reset
- if more panthor_gpu_flush_caches() operations are queued after
  the timeout, we skip them and return -EIO directly to avoid needless
  waits (the memory block won't miraculously work again)

v2:
- New patch

Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
index eda670229184..abd2fde04da9 100644
--- a/drivers/gpu/drm/panthor/panthor_gpu.c
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -283,38 +283,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
 int panthor_gpu_flush_caches(struct panthor_device *ptdev,
 			     u32 l2, u32 lsc, u32 other)
 {
-	bool timedout = false;
 	unsigned long flags;
+	int ret = 0;
 
 	/* Serialize cache flush operations. */
 	guard(mutex)(&ptdev->gpu->cache_flush_lock);
 
 	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
-	if (!drm_WARN_ON(&ptdev->base,
-			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
+	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
 		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
+	} else {
+		ret = -EIO;
 	}
 	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 
+	if (ret)
+		return ret;
+
 	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
 				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
 				msecs_to_jiffies(100))) {
 		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
 		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
 		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
-			timedout = true;
+			ret = -ETIMEDOUT;
 		else
 			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 	}
 
-	if (timedout) {
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
 		drm_err(&ptdev->base, "Flush caches timeout");
-		return -ETIMEDOUT;
 	}
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -354,6 +358,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
 		return -ETIMEDOUT;
 	}
 
+	ptdev->gpu->pending_reqs = 0;
 	return 0;
 }
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (2 preceding siblings ...)
  2025-11-13 10:39 ` [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 14:26   ` Steven Price
  2025-11-13 10:39 ` [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
  2025-11-13 10:39 ` [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Move the lock/flush_mem operations around the gpuvm_sm_[un]map() calls
so we can implement true atomic page updates, where any access in the
locked range done by the GPU has to wait for the page table updates
to land before proceeding.

This is needed for vkQueueBindSparse(), so we can replace the dummy
page mapped over the entire object by actual BO backed pages in an atomic
way. But it's also useful to avoid "AS_ACTIVE bit stuck" failures in
the sm_[un]map() path, leading to gpuvm state inconsistencies.

v2:
- Adjust to match the two new preliminary patches

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 184 +++++++++++++-------------
 1 file changed, 95 insertions(+), 89 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index f109c1588186..21389137a324 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -389,6 +389,15 @@ struct panthor_vm {
 	 * flagged as faulty as a result.
 	 */
 	bool unhandled_fault;
+
+	/** @locked_region: Information about the currently locked region currently. */
+	struct {
+		/** @locked_region.start: Start of the locked region. */
+		u64 start;
+
+		/** @locked_region.size: Size of the locked region. */
+		u64 size;
+	} locked_region;
 };
 
 /**
@@ -566,76 +575,9 @@ static u64 pack_region_range(u64 region_start, u64 size)
 	return region_width | region_start;
 }
 
-static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
-				      u64 iova, u64 size, u32 op)
-{
-	const u32 l2_flush_op = CACHE_CLEAN | CACHE_INV;
-	u32 lsc_flush_op;
-	int ret;
-
-	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
-
-	switch (op) {
-	case AS_COMMAND_FLUSH_MEM:
-		lsc_flush_op = CACHE_CLEAN | CACHE_INV;
-		break;
-	case AS_COMMAND_FLUSH_PT:
-		lsc_flush_op = 0;
-		break;
-	default:
-		drm_WARN(&ptdev->base, 1, "Unexpected AS_COMMAND: %d", op);
-		return -EINVAL;
-	}
-
-	if (as_nr < 0)
-		return 0;
-
-	/*
-	 * If the AS number is greater than zero, then we can be sure
-	 * the device is up and running, so we don't need to explicitly
-	 * power it up
-	 */
-
-	/* Lock the region that needs to be updated */
-	gpu_write64(ptdev, AS_LOCKADDR(as_nr), pack_region_range(iova, size));
-	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
-	if (ret)
-		return ret;
-
-	ret = panthor_gpu_flush_caches(ptdev, l2_flush_op, lsc_flush_op, 0);
-	if (ret)
-		return ret;
-
-	/*
-	 * Explicitly unlock the region as the AS is not unlocked automatically
-	 * at the end of the GPU_CONTROL cache flush command, unlike
-	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
-	 */
-	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
-}
-
-static int mmu_hw_do_operation(struct panthor_vm *vm,
-			       u64 iova, u64 size, u32 op)
-{
-	struct panthor_device *ptdev = vm->ptdev;
-	int ret;
-
-	mutex_lock(&ptdev->mmu->as.slots_lock);
-	ret = mmu_hw_do_operation_locked(ptdev, vm->as.id, iova, size, op);
-	mutex_unlock(&ptdev->mmu->as.slots_lock);
-
-	return ret;
-}
-
 static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
 				 u64 transtab, u64 transcfg, u64 memattr)
 {
-	int ret;
-
-	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
-	if (ret)
-		return ret;
-
 	gpu_write64(ptdev, AS_TRANSTAB(as_nr), transtab);
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
@@ -647,7 +589,9 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
 {
 	int ret;
 
-	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
+	/* Flush+invalidate RW caches, invalidate RO ones. */
+	ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
+				       CACHE_CLEAN | CACHE_INV, CACHE_INV);
 	if (ret)
 		return ret;
 
@@ -729,6 +673,10 @@ int panthor_vm_active(struct panthor_vm *vm)
 	if (refcount_inc_not_zero(&vm->as.active_cnt))
 		goto out_dev_exit;
 
+	/* Make sure we don't race with lock/unlock_region() calls
+	 * happening around VM bind operations.
+	 */
+	mutex_lock(&vm->op_lock);
 	mutex_lock(&ptdev->mmu->as.slots_lock);
 
 	if (refcount_inc_not_zero(&vm->as.active_cnt))
@@ -796,6 +744,10 @@ int panthor_vm_active(struct panthor_vm *vm)
 		gpu_write(ptdev, MMU_INT_MASK, ~ptdev->mmu->as.faulty_mask);
 	}
 
+	/* The VM update is guarded by ::op_lock, which we take at the beginning
+	 * of this function, so we don't expect any locked region here.
+	 */
+	drm_WARN_ON(&vm->ptdev->base, vm->locked_region.size > 0);
 	ret = panthor_mmu_as_enable(vm->ptdev, vm->as.id, transtab, transcfg, vm->memattr);
 
 out_make_active:
@@ -806,6 +758,7 @@ int panthor_vm_active(struct panthor_vm *vm)
 
 out_unlock:
 	mutex_unlock(&ptdev->mmu->as.slots_lock);
+	mutex_unlock(&vm->op_lock);
 
 out_dev_exit:
 	drm_dev_exit(cookie);
@@ -889,30 +842,15 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
 	return SZ_2M;
 }
 
-static int panthor_vm_flush_range(struct panthor_vm *vm, u64 iova, u64 size)
-{
-	struct panthor_device *ptdev = vm->ptdev;
-	int ret = 0, cookie;
-
-	if (vm->as.id < 0)
-		return 0;
-
-	/* If the device is unplugged, we just silently skip the flush. */
-	if (!drm_dev_enter(&ptdev->base, &cookie))
-		return 0;
-
-	ret = mmu_hw_do_operation(vm, iova, size, AS_COMMAND_FLUSH_PT);
-
-	drm_dev_exit(cookie);
-	return ret;
-}
-
 static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 {
 	struct panthor_device *ptdev = vm->ptdev;
 	struct io_pgtable_ops *ops = vm->pgtbl_ops;
 	u64 offset = 0;
 
+	drm_WARN_ON(&ptdev->base,
+		    (iova < vm->locked_region.start) ||
+		    (iova + size > vm->locked_region.start + vm->locked_region.size));
 	drm_dbg(&ptdev->base, "unmap: as=%d, iova=%llx, len=%llx", vm->as.id, iova, size);
 
 	while (offset < size) {
@@ -926,13 +864,12 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 				iova + offset + unmapped_sz,
 				iova + offset + pgsize * pgcount,
 				iova, iova + size);
-			panthor_vm_flush_range(vm, iova, offset + unmapped_sz);
 			return  -EINVAL;
 		}
 		offset += unmapped_sz;
 	}
 
-	return panthor_vm_flush_range(vm, iova, size);
+	return 0;
 }
 
 static int
@@ -949,6 +886,10 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 	if (!size)
 		return 0;
 
+	drm_WARN_ON(&ptdev->base,
+		    (iova < vm->locked_region.start) ||
+		    (iova + size > vm->locked_region.start + vm->locked_region.size));
+
 	for_each_sgtable_dma_sg(sgt, sgl, count) {
 		dma_addr_t paddr = sg_dma_address(sgl);
 		size_t len = sg_dma_len(sgl);
@@ -996,7 +937,7 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 		offset = 0;
 	}
 
-	return panthor_vm_flush_range(vm, start_iova, iova - start_iova);
+	return 0;
 }
 
 static int flags_to_prot(u32 flags)
@@ -1679,6 +1620,61 @@ static const char *access_type_name(struct panthor_device *ptdev,
 	}
 }
 
+static int panthor_vm_lock_region(struct panthor_vm *vm, u64 start, u64 size)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int ret = 0;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	drm_WARN_ON(&ptdev->base, vm->locked_region.start || vm->locked_region.size);
+	if (vm->as.id >= 0 && size) {
+		/* Lock the region that needs to be updated */
+		gpu_write64(ptdev, AS_LOCKADDR(vm->as.id), pack_region_range(start, size));
+
+		/* If the lock succeeded, update the locked_region info. */
+		ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_LOCK);
+	}
+
+	if (!ret) {
+		vm->locked_region.start = start;
+		vm->locked_region.size = size;
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	return ret;
+}
+
+static void panthor_vm_unlock_region(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	if (vm->as.id >= 0) {
+		int ret;
+
+		/* flush+invalidate RW caches and invalidate RO ones.
+		 * TODO: See if we can use FLUSH_PA_RANGE when the physical
+		 * range is narrow enough and the HW supports it.
+		 */
+		ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
+					       CACHE_CLEAN | CACHE_INV,
+					       CACHE_INV);
+
+		/* Unlock the region if the flush is effective. */
+		if (!ret)
+			ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_UNLOCK);
+
+		/* If we fail to flush or unlock the region, schedule a GPU reset
+		 * to unblock the situation.
+		 */
+		if (ret)
+			panthor_device_schedule_reset(ptdev);
+	}
+	vm->locked_region.start = 0;
+	vm->locked_region.size = 0;
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
 static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
 {
 	bool has_unhandled_faults = false;
@@ -1883,6 +1879,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
 	drm_sched_entity_destroy(&vm->entity);
 	drm_sched_fini(&vm->sched);
 
+	mutex_lock(&vm->op_lock);
 	mutex_lock(&ptdev->mmu->as.slots_lock);
 	if (vm->as.id >= 0) {
 		int cookie;
@@ -1897,6 +1894,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
 		list_del(&vm->as.lru_node);
 	}
 	mutex_unlock(&ptdev->mmu->as.slots_lock);
+	mutex_unlock(&vm->op_lock);
 
 	free_io_pgtable_ops(vm->pgtbl_ops);
 
@@ -2206,6 +2204,11 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 
 	mutex_lock(&vm->op_lock);
 	vm->op_ctx = op;
+
+	ret = panthor_vm_lock_region(vm, op->va.addr, op->va.range);
+	if (ret)
+		goto out;
+
 	switch (op_type) {
 	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: {
 		const struct drm_gpuvm_map_req map_req = {
@@ -2233,6 +2236,9 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 		break;
 	}
 
+	panthor_vm_unlock_region(vm);
+
+out:
 	if (ret && flag_vm_unusable_on_failure)
 		vm->unusable = true;
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (3 preceding siblings ...)
  2025-11-13 10:39 ` [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 15:07   ` Steven Price
  2025-11-13 10:39 ` [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

There's no reason for panthor_vm_[un]map_pages() to fail unless the
drm_gpuvm state and the page table are out of sync, so let's reflect that
by making panthor_vm_unmap_pages() a void function and adding
WARN_ON()s in various places. We also try to recover from those
unexpected mismatch by checking for already unmapped ranges and skipping
them. But there's only so much we can do to try and cope with such
SW bugs, so when we see a mismatch, we flag the VM unusable and disable
the AS to avoid further GPU accesses to the memory.

It could be that the as_disable() call fails because the MMU unit is
stuck, in which case the whole GPU is frozen, and only a GPU reset can
unblock things. Ater the reset, the VM will be seen as unusable and
any attempt to re-use it will fail, so we should be covered for any
use-after-unmap issues.

v2:
- Fix double unlock

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 82 ++++++++++++++++++---------
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 21389137a324..35aad1e0ecaa 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -842,12 +842,32 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
 	return SZ_2M;
 }
 
-static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
+static void panthor_vm_declare_unusable(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int cookie;
+
+	if (vm->unusable)
+		return;
+
+	vm->unusable = true;
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	if (vm->as.id >= 0 && drm_dev_enter(&ptdev->base, &cookie)) {
+		panthor_mmu_as_disable(ptdev, vm->as.id);
+		drm_dev_exit(cookie);
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
+static void panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 {
 	struct panthor_device *ptdev = vm->ptdev;
 	struct io_pgtable_ops *ops = vm->pgtbl_ops;
 	u64 offset = 0;
 
+	if (!size)
+		return;
+
 	drm_WARN_ON(&ptdev->base,
 		    (iova < vm->locked_region.start) ||
 		    (iova + size > vm->locked_region.start + vm->locked_region.size));
@@ -858,18 +878,32 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 		size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
 
 		unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
+		if (drm_WARN_ON_ONCE(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
+			/* Gracefully handle sparsely unmapped regions to avoid leaving
+			 * page table pages behind when the drm_gpuvm and VM page table
+			 * are out-of-sync. This is not supposed to happen, hence the
+			 * above WARN_ON().
+			 */
+			while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
+			       unmapped_sz < pgsize * pgcount)
+				unmapped_sz += SZ_4K;
 
-		if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
-			drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
-				iova + offset + unmapped_sz,
-				iova + offset + pgsize * pgcount,
-				iova, iova + size);
-			return  -EINVAL;
+			/* We're passed the point where we can try to fix things,
+			 * so flag the VM unusable to make sure it's not going
+			 * to be used anymore.
+			 */
+			panthor_vm_declare_unusable(vm);
+
+			/* If we don't make progress, we're screwed. That also means
+			 * something else prevents us from unmapping the region, but
+			 * there's not much we can do here: time for debugging.
+			 */
+			if (drm_WARN_ON_ONCE(&ptdev->base, !unmapped_sz))
+				return;
 		}
+
 		offset += unmapped_sz;
 	}
-
-	return 0;
 }
 
 static int
@@ -917,16 +951,17 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 			paddr += mapped;
 			len -= mapped;
 
-			if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
+			/* If nothing was mapped, consider it an ENOMEM. */
+			if (!ret && !mapped)
 				ret = -ENOMEM;
 
-			if (ret) {
-				/* If something failed, unmap what we've already mapped before
-				 * returning. The unmap call is not supposed to fail.
+			/* If something fails, we stop there, and flag the VM unusable. */
+			if (drm_WARN_ON_ONCE(&ptdev->base, ret)) {
+				/* Unmap what we've already mapped to avoid leaving page
+				 * table pages behind.
 				 */
-				drm_WARN_ON(&ptdev->base,
-					    panthor_vm_unmap_pages(vm, start_iova,
-								   iova - start_iova));
+				panthor_vm_unmap_pages(vm, start_iova, iova - start_iova);
+				panthor_vm_declare_unusable(vm);
 				return ret;
 			}
 		}
@@ -2109,12 +2144,9 @@ static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
 	struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
 	struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
 	u64 unmap_start, unmap_range;
-	int ret;
 
 	drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
-	ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
-	if (ret)
-		return ret;
+	panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
 
 	if (op->remap.prev) {
 		prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
@@ -2154,13 +2186,9 @@ static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
 {
 	struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
 	struct panthor_vm *vm = priv;
-	int ret;
-
-	ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
-				     unmap_vma->base.va.range);
-	if (drm_WARN_ON(&vm->ptdev->base, ret))
-		return ret;
 
+	panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
+			       unmap_vma->base.va.range);
 	drm_gpuva_unmap(&op->unmap);
 	panthor_vma_unlink(vm, unmap_vma);
 	return 0;
@@ -2240,7 +2268,7 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 
 out:
 	if (ret && flag_vm_unusable_on_failure)
-		vm->unusable = true;
+		panthor_vm_declare_unusable(vm);
 
 	vm->op_ctx = NULL;
 	mutex_unlock(&vm->op_lock);
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset()
  2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (4 preceding siblings ...)
  2025-11-13 10:39 ` [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
@ 2025-11-13 10:39 ` Boris Brezillon
  2025-11-17 15:12   ` Steven Price
  5 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-13 10:39 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Groups are only moved out of the runnable lists when
panthor_group_stop() is called or when they run out of jobs.
What should not happen though is having one group added to one of
the runnable list after reset.in_progress has been set to true, but
that's not something we can easily check, so let's just drop the
WARN_ON() in panthor_sched_pre_reset().

v2:
- Adjust explanation in commit message

Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index e74ca071159d..d121b794bd79 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2836,8 +2836,6 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
 	 * new jobs while we're resetting.
 	 */
 	for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) {
-		/* All groups should be in the idle lists. */
-		drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i]));
 		list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node)
 			panthor_group_stop(group);
 	}
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS
  2025-11-13 10:39 ` [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
@ 2025-11-17 12:29   ` Steven Price
  0 siblings, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-11-17 12:29 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> There's currently no situation where we want to issue a command to an
> AS and not wait for this command to complete. The wait is either
> explicitly done (LOCK, UNLOCK) or it's missing (UPDATE). So let's
> turn write_cmd() into as_send_cmd_and_wait() that has the wait after
> a command is sent.
> 
> v2:
> - New patch
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 27 ++++++++++++---------------
>  1 file changed, 12 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 6a41dfd7aaf3..186048fc2c25 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -524,27 +524,29 @@ static int wait_ready(struct panthor_device *ptdev, u32 as_nr)
>  	return ret;
>  }
>  
> -static int write_cmd(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
> +static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
>  {
>  	int status;
>  
>  	/* write AS_COMMAND when MMU is ready to accept another command */
>  	status = wait_ready(ptdev, as_nr);
> -	if (!status)
> +	if (!status) {
>  		gpu_write(ptdev, AS_COMMAND(as_nr), cmd);
> +		status = wait_ready(ptdev, as_nr);
> +	}
>  
>  	return status;
>  }
>  
> -static void lock_region(struct panthor_device *ptdev, u32 as_nr,
> -			u64 region_start, u64 size)
> +static int lock_region(struct panthor_device *ptdev, u32 as_nr,
> +		       u64 region_start, u64 size)
>  {
>  	u8 region_width;
>  	u64 region;
>  	u64 region_end = region_start + size;
>  
>  	if (!size)
> -		return;
> +		return 0;
>  
>  	/*
>  	 * The locked region is a naturally aligned power of 2 block encoded as
> @@ -567,7 +569,7 @@ static void lock_region(struct panthor_device *ptdev, u32 as_nr,
>  
>  	/* Lock the region that needs to be updated */
>  	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
> -	write_cmd(ptdev, as_nr, AS_COMMAND_LOCK);
> +	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
>  }
>  
>  static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
> @@ -600,9 +602,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
>  	 * power it up
>  	 */
>  
> -	lock_region(ptdev, as_nr, iova, size);
> -
> -	ret = wait_ready(ptdev, as_nr);
> +	ret = lock_region(ptdev, as_nr, iova, size);
>  	if (ret)
>  		return ret;
>  
> @@ -615,10 +615,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
>  	 * at the end of the GPU_CONTROL cache flush command, unlike
>  	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
>  	 */
> -	write_cmd(ptdev, as_nr, AS_COMMAND_UNLOCK);
> -
> -	/* Wait for the unlock command to complete */
> -	return wait_ready(ptdev, as_nr);
> +	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
>  }
>  
>  static int mmu_hw_do_operation(struct panthor_vm *vm,
> @@ -647,7 +644,7 @@ static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
>  	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
>  	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
>  
> -	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
> +	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
>  }
>  
>  static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
> @@ -662,7 +659,7 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
>  	gpu_write64(ptdev, AS_MEMATTR(as_nr), 0);
>  	gpu_write64(ptdev, AS_TRANSCFG(as_nr), AS_TRANSCFG_ADRMODE_UNMAPPED);
>  
> -	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
> +	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
>  }
>  
>  static u32 panthor_mmu_fault_mask(struct panthor_device *ptdev, u32 value)


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/6] drm/panthor: Kill lock_region()
  2025-11-13 10:39 ` [PATCH v2 2/6] drm/panthor: Kill lock_region() Boris Brezillon
@ 2025-11-17 12:44   ` Steven Price
  2025-11-17 15:46     ` Boris Brezillon
  0 siblings, 1 reply; 16+ messages in thread
From: Steven Price @ 2025-11-17 12:44 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> The meat in lock_region() is about packing a region range into a
> single u64. The rest is just a regular reg write plus a
> as_send_cmd_and_wait() call that can easily be inlined in
> mmu_hw_do_operation_locked().
> 
> v2:
> - New patch
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 14 +++++---------
>  1 file changed, 5 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 186048fc2c25..f109c1588186 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -538,11 +538,9 @@ static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd
>  	return status;
>  }
>  
> -static int lock_region(struct panthor_device *ptdev, u32 as_nr,
> -		       u64 region_start, u64 size)
> +static u64 pack_region_range(u64 region_start, u64 size)
>  {
>  	u8 region_width;
> -	u64 region;
>  	u64 region_end = region_start + size;
>  
>  	if (!size)
Extra context:
>		return 0;
Rather than skipping the lock for size==0 you are now performing a lock
with LOCKADDR=0. The best documentation I can find for that says (for
LOCKADDR_SIZE in the Midgard architecture): "Values in the range 0 to 10
are undefined and should not be used".

I think later versions upped the minimum to 14 (log2(1<<15)-1) for
reasons due to supporting larger page sizes.

While I suspect this might work (since we don't actually need a lock
region when size==0) I don't think we should be relying on undefined
behaviour.

I think the simple solution is to move the size==0 check up into the caller.

Thanks,
Steve

> @@ -565,11 +563,7 @@ static int lock_region(struct panthor_device *ptdev, u32 as_nr,
>  	 */
>  	region_start &= GENMASK_ULL(63, region_width);
>  
> -	region = region_width | region_start;
> -
> -	/* Lock the region that needs to be updated */
> -	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
> -	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
> +	return region_width | region_start;
>  }
>  
>  static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
> @@ -602,7 +596,9 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
>  	 * power it up
>  	 */
>  
> -	ret = lock_region(ptdev, as_nr, iova, size);
> +	/* Lock the region that needs to be updated */
> +	gpu_write64(ptdev, AS_LOCKADDR(as_nr), pack_region_range(iova, size));
> +	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
>  	if (ret)
>  		return ret;
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures
  2025-11-13 10:39 ` [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
@ 2025-11-17 14:02   ` Steven Price
  2025-11-17 15:44     ` Boris Brezillon
  0 siblings, 1 reply; 16+ messages in thread
From: Steven Price @ 2025-11-17 14:02 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> We have seen a few cases where the whole memory subsystem is blocked
> and flush operations never complete. When that happens, we want to:
> 
> - schedule a reset, so we can recover from this situation
> - in the reset path, we need to reset the pending_reqs so we can send
>   new commands after the reset
> - if more panthor_gpu_flush_caches() operations are queued after
>   the timeout, we skip them and return -EIO directly to avoid needless
>   waits (the memory block won't miraculously work again)

You've removed the WARN from this last case. Is this intentional? I
agree the recovery is better, but I don't think we expect this to happen
- so it's pointing to something else being broken.

> 
> v2:
> - New patch
> 
> Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
> index eda670229184..abd2fde04da9 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> @@ -283,38 +283,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
>  int panthor_gpu_flush_caches(struct panthor_device *ptdev,
>  			     u32 l2, u32 lsc, u32 other)
>  {
> -	bool timedout = false;
>  	unsigned long flags;
> +	int ret = 0;
>  
>  	/* Serialize cache flush operations. */
>  	guard(mutex)(&ptdev->gpu->cache_flush_lock);
>  
>  	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
> -	if (!drm_WARN_ON(&ptdev->base,
> -			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
> +	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
>  		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
>  		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
> +	} else {
> +		ret = -EIO;
>  	}
>  	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>  
> +	if (ret)
> +		return ret;
> +
>  	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
>  				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
>  				msecs_to_jiffies(100))) {
>  		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
>  		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
>  		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
> -			timedout = true;
> +			ret = -ETIMEDOUT;
>  		else
>  			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
>  		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>  	}
>  
> -	if (timedout) {
> +	if (ret) {
> +		panthor_device_schedule_reset(ptdev);
>  		drm_err(&ptdev->base, "Flush caches timeout");
> -		return -ETIMEDOUT;
>  	}
>  
> -	return 0;
> +	return ret;
>  }
>  
>  /**
> @@ -354,6 +358,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
>  		return -ETIMEDOUT;
>  	}
>  
> +	ptdev->gpu->pending_reqs = 0;
>  	return 0;
>  }
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates
  2025-11-13 10:39 ` [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
@ 2025-11-17 14:26   ` Steven Price
  0 siblings, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-11-17 14:26 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> Move the lock/flush_mem operations around the gpuvm_sm_[un]map() calls
> so we can implement true atomic page updates, where any access in the
> locked range done by the GPU has to wait for the page table updates
> to land before proceeding.
> 
> This is needed for vkQueueBindSparse(), so we can replace the dummy
> page mapped over the entire object by actual BO backed pages in an atomic
> way. But it's also useful to avoid "AS_ACTIVE bit stuck" failures in
> the sm_[un]map() path, leading to gpuvm state inconsistencies.
> 
> v2:
> - Adjust to match the two new preliminary patches
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Looks fine to me.

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 184 +++++++++++++-------------
>  1 file changed, 95 insertions(+), 89 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index f109c1588186..21389137a324 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -389,6 +389,15 @@ struct panthor_vm {
>  	 * flagged as faulty as a result.
>  	 */
>  	bool unhandled_fault;
> +
> +	/** @locked_region: Information about the currently locked region currently. */
> +	struct {
> +		/** @locked_region.start: Start of the locked region. */
> +		u64 start;
> +
> +		/** @locked_region.size: Size of the locked region. */
> +		u64 size;
> +	} locked_region;
>  };
>  
>  /**
> @@ -566,76 +575,9 @@ static u64 pack_region_range(u64 region_start, u64 size)
>  	return region_width | region_start;
>  }
>  
> -static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
> -				      u64 iova, u64 size, u32 op)
> -{
> -	const u32 l2_flush_op = CACHE_CLEAN | CACHE_INV;
> -	u32 lsc_flush_op;
> -	int ret;
> -
> -	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
> -
> -	switch (op) {
> -	case AS_COMMAND_FLUSH_MEM:
> -		lsc_flush_op = CACHE_CLEAN | CACHE_INV;
> -		break;
> -	case AS_COMMAND_FLUSH_PT:
> -		lsc_flush_op = 0;
> -		break;
> -	default:
> -		drm_WARN(&ptdev->base, 1, "Unexpected AS_COMMAND: %d", op);
> -		return -EINVAL;
> -	}
> -
> -	if (as_nr < 0)
> -		return 0;
> -
> -	/*
> -	 * If the AS number is greater than zero, then we can be sure
> -	 * the device is up and running, so we don't need to explicitly
> -	 * power it up
> -	 */
> -
> -	/* Lock the region that needs to be updated */
> -	gpu_write64(ptdev, AS_LOCKADDR(as_nr), pack_region_range(iova, size));
> -	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
> -	if (ret)
> -		return ret;
> -
> -	ret = panthor_gpu_flush_caches(ptdev, l2_flush_op, lsc_flush_op, 0);
> -	if (ret)
> -		return ret;
> -
> -	/*
> -	 * Explicitly unlock the region as the AS is not unlocked automatically
> -	 * at the end of the GPU_CONTROL cache flush command, unlike
> -	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
> -	 */
> -	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
> -}
> -
> -static int mmu_hw_do_operation(struct panthor_vm *vm,
> -			       u64 iova, u64 size, u32 op)
> -{
> -	struct panthor_device *ptdev = vm->ptdev;
> -	int ret;
> -
> -	mutex_lock(&ptdev->mmu->as.slots_lock);
> -	ret = mmu_hw_do_operation_locked(ptdev, vm->as.id, iova, size, op);
> -	mutex_unlock(&ptdev->mmu->as.slots_lock);
> -
> -	return ret;
> -}
> -
>  static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
>  				 u64 transtab, u64 transcfg, u64 memattr)
>  {
> -	int ret;
> -
> -	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
> -	if (ret)
> -		return ret;
> -
>  	gpu_write64(ptdev, AS_TRANSTAB(as_nr), transtab);
>  	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
>  	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
> @@ -647,7 +589,9 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
>  {
>  	int ret;
>  
> -	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
> +	/* Flush+invalidate RW caches, invalidate RO ones. */
> +	ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
> +				       CACHE_CLEAN | CACHE_INV, CACHE_INV);
>  	if (ret)
>  		return ret;
>  
> @@ -729,6 +673,10 @@ int panthor_vm_active(struct panthor_vm *vm)
>  	if (refcount_inc_not_zero(&vm->as.active_cnt))
>  		goto out_dev_exit;
>  
> +	/* Make sure we don't race with lock/unlock_region() calls
> +	 * happening around VM bind operations.
> +	 */
> +	mutex_lock(&vm->op_lock);
>  	mutex_lock(&ptdev->mmu->as.slots_lock);
>  
>  	if (refcount_inc_not_zero(&vm->as.active_cnt))
> @@ -796,6 +744,10 @@ int panthor_vm_active(struct panthor_vm *vm)
>  		gpu_write(ptdev, MMU_INT_MASK, ~ptdev->mmu->as.faulty_mask);
>  	}
>  
> +	/* The VM update is guarded by ::op_lock, which we take at the beginning
> +	 * of this function, so we don't expect any locked region here.
> +	 */
> +	drm_WARN_ON(&vm->ptdev->base, vm->locked_region.size > 0);
>  	ret = panthor_mmu_as_enable(vm->ptdev, vm->as.id, transtab, transcfg, vm->memattr);
>  
>  out_make_active:
> @@ -806,6 +758,7 @@ int panthor_vm_active(struct panthor_vm *vm)
>  
>  out_unlock:
>  	mutex_unlock(&ptdev->mmu->as.slots_lock);
> +	mutex_unlock(&vm->op_lock);
>  
>  out_dev_exit:
>  	drm_dev_exit(cookie);
> @@ -889,30 +842,15 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
>  	return SZ_2M;
>  }
>  
> -static int panthor_vm_flush_range(struct panthor_vm *vm, u64 iova, u64 size)
> -{
> -	struct panthor_device *ptdev = vm->ptdev;
> -	int ret = 0, cookie;
> -
> -	if (vm->as.id < 0)
> -		return 0;
> -
> -	/* If the device is unplugged, we just silently skip the flush. */
> -	if (!drm_dev_enter(&ptdev->base, &cookie))
> -		return 0;
> -
> -	ret = mmu_hw_do_operation(vm, iova, size, AS_COMMAND_FLUSH_PT);
> -
> -	drm_dev_exit(cookie);
> -	return ret;
> -}
> -
>  static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>  {
>  	struct panthor_device *ptdev = vm->ptdev;
>  	struct io_pgtable_ops *ops = vm->pgtbl_ops;
>  	u64 offset = 0;
>  
> +	drm_WARN_ON(&ptdev->base,
> +		    (iova < vm->locked_region.start) ||
> +		    (iova + size > vm->locked_region.start + vm->locked_region.size));
>  	drm_dbg(&ptdev->base, "unmap: as=%d, iova=%llx, len=%llx", vm->as.id, iova, size);
>  
>  	while (offset < size) {
> @@ -926,13 +864,12 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>  				iova + offset + unmapped_sz,
>  				iova + offset + pgsize * pgcount,
>  				iova, iova + size);
> -			panthor_vm_flush_range(vm, iova, offset + unmapped_sz);
>  			return  -EINVAL;
>  		}
>  		offset += unmapped_sz;
>  	}
>  
> -	return panthor_vm_flush_range(vm, iova, size);
> +	return 0;
>  }
>  
>  static int
> @@ -949,6 +886,10 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
>  	if (!size)
>  		return 0;
>  
> +	drm_WARN_ON(&ptdev->base,
> +		    (iova < vm->locked_region.start) ||
> +		    (iova + size > vm->locked_region.start + vm->locked_region.size));
> +
>  	for_each_sgtable_dma_sg(sgt, sgl, count) {
>  		dma_addr_t paddr = sg_dma_address(sgl);
>  		size_t len = sg_dma_len(sgl);
> @@ -996,7 +937,7 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
>  		offset = 0;
>  	}
>  
> -	return panthor_vm_flush_range(vm, start_iova, iova - start_iova);
> +	return 0;
>  }
>  
>  static int flags_to_prot(u32 flags)
> @@ -1679,6 +1620,61 @@ static const char *access_type_name(struct panthor_device *ptdev,
>  	}
>  }
>  
> +static int panthor_vm_lock_region(struct panthor_vm *vm, u64 start, u64 size)
> +{
> +	struct panthor_device *ptdev = vm->ptdev;
> +	int ret = 0;
> +
> +	mutex_lock(&ptdev->mmu->as.slots_lock);
> +	drm_WARN_ON(&ptdev->base, vm->locked_region.start || vm->locked_region.size);
> +	if (vm->as.id >= 0 && size) {
> +		/* Lock the region that needs to be updated */
> +		gpu_write64(ptdev, AS_LOCKADDR(vm->as.id), pack_region_range(start, size));
> +
> +		/* If the lock succeeded, update the locked_region info. */
> +		ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_LOCK);
> +	}
> +
> +	if (!ret) {
> +		vm->locked_region.start = start;
> +		vm->locked_region.size = size;
> +	}
> +	mutex_unlock(&ptdev->mmu->as.slots_lock);
> +
> +	return ret;
> +}
> +
> +static void panthor_vm_unlock_region(struct panthor_vm *vm)
> +{
> +	struct panthor_device *ptdev = vm->ptdev;
> +
> +	mutex_lock(&ptdev->mmu->as.slots_lock);
> +	if (vm->as.id >= 0) {
> +		int ret;
> +
> +		/* flush+invalidate RW caches and invalidate RO ones.
> +		 * TODO: See if we can use FLUSH_PA_RANGE when the physical
> +		 * range is narrow enough and the HW supports it.
> +		 */
> +		ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
> +					       CACHE_CLEAN | CACHE_INV,
> +					       CACHE_INV);
> +
> +		/* Unlock the region if the flush is effective. */
> +		if (!ret)
> +			ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_UNLOCK);
> +
> +		/* If we fail to flush or unlock the region, schedule a GPU reset
> +		 * to unblock the situation.
> +		 */
> +		if (ret)
> +			panthor_device_schedule_reset(ptdev);
> +	}
> +	vm->locked_region.start = 0;
> +	vm->locked_region.size = 0;
> +	mutex_unlock(&ptdev->mmu->as.slots_lock);
> +}
> +
>  static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
>  {
>  	bool has_unhandled_faults = false;
> @@ -1883,6 +1879,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
>  	drm_sched_entity_destroy(&vm->entity);
>  	drm_sched_fini(&vm->sched);
>  
> +	mutex_lock(&vm->op_lock);
>  	mutex_lock(&ptdev->mmu->as.slots_lock);
>  	if (vm->as.id >= 0) {
>  		int cookie;
> @@ -1897,6 +1894,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
>  		list_del(&vm->as.lru_node);
>  	}
>  	mutex_unlock(&ptdev->mmu->as.slots_lock);
> +	mutex_unlock(&vm->op_lock);
>  
>  	free_io_pgtable_ops(vm->pgtbl_ops);
>  
> @@ -2206,6 +2204,11 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
>  
>  	mutex_lock(&vm->op_lock);
>  	vm->op_ctx = op;
> +
> +	ret = panthor_vm_lock_region(vm, op->va.addr, op->va.range);
> +	if (ret)
> +		goto out;
> +
>  	switch (op_type) {
>  	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: {
>  		const struct drm_gpuvm_map_req map_req = {
> @@ -2233,6 +2236,9 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
>  		break;
>  	}
>  
> +	panthor_vm_unlock_region(vm);
> +
> +out:
>  	if (ret && flag_vm_unusable_on_failure)
>  		vm->unusable = true;
>  


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust
  2025-11-13 10:39 ` [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
@ 2025-11-17 15:07   ` Steven Price
  0 siblings, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-11-17 15:07 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> There's no reason for panthor_vm_[un]map_pages() to fail unless the
> drm_gpuvm state and the page table are out of sync, so let's reflect that
> by making panthor_vm_unmap_pages() a void function and adding
> WARN_ON()s in various places. We also try to recover from those
> unexpected mismatch by checking for already unmapped ranges and skipping
> them. But there's only so much we can do to try and cope with such
> SW bugs, so when we see a mismatch, we flag the VM unusable and disable
> the AS to avoid further GPU accesses to the memory.
> 
> It could be that the as_disable() call fails because the MMU unit is
> stuck, in which case the whole GPU is frozen, and only a GPU reset can
> unblock things. Ater the reset, the VM will be seen as unusable and
> any attempt to re-use it will fail, so we should be covered for any
> use-after-unmap issues.
> 
> v2:
> - Fix double unlock
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 82 ++++++++++++++++++---------
>  1 file changed, 55 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 21389137a324..35aad1e0ecaa 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -842,12 +842,32 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
>  	return SZ_2M;
>  }
>  
> -static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
> +static void panthor_vm_declare_unusable(struct panthor_vm *vm)
> +{
> +	struct panthor_device *ptdev = vm->ptdev;
> +	int cookie;
> +
> +	if (vm->unusable)
> +		return;
> +
> +	vm->unusable = true;
> +	mutex_lock(&ptdev->mmu->as.slots_lock);
> +	if (vm->as.id >= 0 && drm_dev_enter(&ptdev->base, &cookie)) {
> +		panthor_mmu_as_disable(ptdev, vm->as.id);
> +		drm_dev_exit(cookie);
> +	}
> +	mutex_unlock(&ptdev->mmu->as.slots_lock);
> +}
> +
> +static void panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>  {
>  	struct panthor_device *ptdev = vm->ptdev;
>  	struct io_pgtable_ops *ops = vm->pgtbl_ops;
>  	u64 offset = 0;
>  
> +	if (!size)
> +		return;
> +
>  	drm_WARN_ON(&ptdev->base,
>  		    (iova < vm->locked_region.start) ||
>  		    (iova + size > vm->locked_region.start + vm->locked_region.size));
> @@ -858,18 +878,32 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>  		size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
>  
>  		unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
> +		if (drm_WARN_ON_ONCE(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
> +			/* Gracefully handle sparsely unmapped regions to avoid leaving
> +			 * page table pages behind when the drm_gpuvm and VM page table
> +			 * are out-of-sync. This is not supposed to happen, hence the
> +			 * above WARN_ON().
> +			 */
> +			while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
> +			       unmapped_sz < pgsize * pgcount)
> +				unmapped_sz += SZ_4K;
>  
> -		if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
> -			drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
> -				iova + offset + unmapped_sz,
> -				iova + offset + pgsize * pgcount,
> -				iova, iova + size);
> -			return  -EINVAL;
> +			/* We're passed the point where we can try to fix things,
> +			 * so flag the VM unusable to make sure it's not going
> +			 * to be used anymore.
> +			 */
> +			panthor_vm_declare_unusable(vm);
> +
> +			/* If we don't make progress, we're screwed. That also means
> +			 * something else prevents us from unmapping the region, but
> +			 * there's not much we can do here: time for debugging.
> +			 */
> +			if (drm_WARN_ON_ONCE(&ptdev->base, !unmapped_sz))
> +				return;
>  		}
> +
>  		offset += unmapped_sz;
>  	}
> -
> -	return 0;
>  }
>  
>  static int
> @@ -917,16 +951,17 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
>  			paddr += mapped;
>  			len -= mapped;
>  
> -			if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
> +			/* If nothing was mapped, consider it an ENOMEM. */
> +			if (!ret && !mapped)
>  				ret = -ENOMEM;
>  
> -			if (ret) {
> -				/* If something failed, unmap what we've already mapped before
> -				 * returning. The unmap call is not supposed to fail.
> +			/* If something fails, we stop there, and flag the VM unusable. */
> +			if (drm_WARN_ON_ONCE(&ptdev->base, ret)) {
> +				/* Unmap what we've already mapped to avoid leaving page
> +				 * table pages behind.
>  				 */
> -				drm_WARN_ON(&ptdev->base,
> -					    panthor_vm_unmap_pages(vm, start_iova,
> -								   iova - start_iova));
> +				panthor_vm_unmap_pages(vm, start_iova, iova - start_iova);
> +				panthor_vm_declare_unusable(vm);
>  				return ret;
>  			}
>  		}
> @@ -2109,12 +2144,9 @@ static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
>  	struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
>  	struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
>  	u64 unmap_start, unmap_range;
> -	int ret;
>  
>  	drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
> -	ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
> -	if (ret)
> -		return ret;
> +	panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
>  
>  	if (op->remap.prev) {
>  		prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
> @@ -2154,13 +2186,9 @@ static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
>  {
>  	struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
>  	struct panthor_vm *vm = priv;
> -	int ret;
> -
> -	ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
> -				     unmap_vma->base.va.range);
> -	if (drm_WARN_ON(&vm->ptdev->base, ret))
> -		return ret;
>  
> +	panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
> +			       unmap_vma->base.va.range);
>  	drm_gpuva_unmap(&op->unmap);
>  	panthor_vma_unlink(vm, unmap_vma);
>  	return 0;
> @@ -2240,7 +2268,7 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
>  
>  out:
>  	if (ret && flag_vm_unusable_on_failure)
> -		vm->unusable = true;
> +		panthor_vm_declare_unusable(vm);
>  
>  	vm->op_ctx = NULL;
>  	mutex_unlock(&vm->op_lock);


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset()
  2025-11-13 10:39 ` [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
@ 2025-11-17 15:12   ` Steven Price
  0 siblings, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-11-17 15:12 UTC (permalink / raw)
  To: Boris Brezillon, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On 13/11/2025 10:39, Boris Brezillon wrote:
> Groups are only moved out of the runnable lists when
> panthor_group_stop() is called or when they run out of jobs.
> What should not happen though is having one group added to one of
> the runnable list after reset.in_progress has been set to true, but
> that's not something we can easily check, so let's just drop the
> WARN_ON() in panthor_sched_pre_reset().
> 
> v2:
> - Adjust explanation in commit message
> 
> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>

Ah, so if I wasn't so far behind on my emails I'd have spotted this
updated version! ;)

That commit message looks fine.

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index e74ca071159d..d121b794bd79 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -2836,8 +2836,6 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
>  	 * new jobs while we're resetting.
>  	 */
>  	for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) {
> -		/* All groups should be in the idle lists. */
> -		drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i]));
>  		list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node)
>  			panthor_group_stop(group);
>  	}


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures
  2025-11-17 14:02   ` Steven Price
@ 2025-11-17 15:44     ` Boris Brezillon
  2025-11-17 16:10       ` Steven Price
  0 siblings, 1 reply; 16+ messages in thread
From: Boris Brezillon @ 2025-11-17 15:44 UTC (permalink / raw)
  To: Steven Price
  Cc: Liviu Dudau, Adrián Larumbe, dri-devel, Akash Goel,
	Karunika Choo, kernel

On Mon, 17 Nov 2025 14:02:35 +0000
Steven Price <steven.price@arm.com> wrote:

> On 13/11/2025 10:39, Boris Brezillon wrote:
> > We have seen a few cases where the whole memory subsystem is blocked
> > and flush operations never complete. When that happens, we want to:
> > 
> > - schedule a reset, so we can recover from this situation
> > - in the reset path, we need to reset the pending_reqs so we can send
> >   new commands after the reset
> > - if more panthor_gpu_flush_caches() operations are queued after
> >   the timeout, we skip them and return -EIO directly to avoid needless
> >   waits (the memory block won't miraculously work again)  
> 
> You've removed the WARN from this last case. Is this intentional? I
> agree the recovery is better, but I don't think we expect this to happen
> - so it's pointing to something else being broken.

I did because there's a way for the UMD to trigger that (see the link
to the bug in the cover letter) without any mitigation we can put in
place kernel side, other than the GPU reset I'm adding here.I
tend to use WARN_ON()s only for things the kernel code has control on,
not stuff users can force the kernel driver into. Note that I kept the
drm_err(), so we still have a trace of such errors in the logs (along
with some timeouts).

> 
> > 
> > v2:
> > - New patch
> > 
> > Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >  drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
> >  1 file changed, 12 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
> > index eda670229184..abd2fde04da9 100644
> > --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> > +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> > @@ -283,38 +283,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
> >  int panthor_gpu_flush_caches(struct panthor_device *ptdev,
> >  			     u32 l2, u32 lsc, u32 other)
> >  {
> > -	bool timedout = false;
> >  	unsigned long flags;
> > +	int ret = 0;
> >  
> >  	/* Serialize cache flush operations. */
> >  	guard(mutex)(&ptdev->gpu->cache_flush_lock);
> >  
> >  	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
> > -	if (!drm_WARN_ON(&ptdev->base,
> > -			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
> > +	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
> >  		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
> >  		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
> > +	} else {
> > +		ret = -EIO;
> >  	}
> >  	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
> >  
> > +	if (ret)
> > +		return ret;
> > +
> >  	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
> >  				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
> >  				msecs_to_jiffies(100))) {
> >  		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
> >  		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
> >  		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
> > -			timedout = true;
> > +			ret = -ETIMEDOUT;
> >  		else
> >  			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
> >  		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
> >  	}
> >  
> > -	if (timedout) {
> > +	if (ret) {
> > +		panthor_device_schedule_reset(ptdev);
> >  		drm_err(&ptdev->base, "Flush caches timeout");
> > -		return -ETIMEDOUT;
> >  	}
> >  
> > -	return 0;
> > +	return ret;
> >  }
> >  
> >  /**
> > @@ -354,6 +358,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
> >  		return -ETIMEDOUT;
> >  	}
> >  
> > +	ptdev->gpu->pending_reqs = 0;
> >  	return 0;
> >  }
> >    
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 2/6] drm/panthor: Kill lock_region()
  2025-11-17 12:44   ` Steven Price
@ 2025-11-17 15:46     ` Boris Brezillon
  0 siblings, 0 replies; 16+ messages in thread
From: Boris Brezillon @ 2025-11-17 15:46 UTC (permalink / raw)
  To: Steven Price
  Cc: Liviu Dudau, Adrián Larumbe, dri-devel, Akash Goel,
	Karunika Choo, kernel

On Mon, 17 Nov 2025 12:44:58 +0000
Steven Price <steven.price@arm.com> wrote:

> On 13/11/2025 10:39, Boris Brezillon wrote:
> > The meat in lock_region() is about packing a region range into a
> > single u64. The rest is just a regular reg write plus a
> > as_send_cmd_and_wait() call that can easily be inlined in
> > mmu_hw_do_operation_locked().
> > 
> > v2:
> > - New patch
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >  drivers/gpu/drm/panthor/panthor_mmu.c | 14 +++++---------
> >  1 file changed, 5 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> > index 186048fc2c25..f109c1588186 100644
> > --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> > @@ -538,11 +538,9 @@ static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd
> >  	return status;
> >  }
> >  
> > -static int lock_region(struct panthor_device *ptdev, u32 as_nr,
> > -		       u64 region_start, u64 size)
> > +static u64 pack_region_range(u64 region_start, u64 size)
> >  {
> >  	u8 region_width;
> > -	u64 region;
> >  	u64 region_end = region_start + size;
> >  
> >  	if (!size)  
> Extra context:
> >		return 0;  
> Rather than skipping the lock for size==0 you are now performing a lock
> with LOCKADDR=0. The best documentation I can find for that says (for
> LOCKADDR_SIZE in the Midgard architecture): "Values in the range 0 to 10
> are undefined and should not be used".
> 
> I think later versions upped the minimum to 14 (log2(1<<15)-1) for
> reasons due to supporting larger page sizes.
> 
> While I suspect this might work (since we don't actually need a lock
> region when size==0) I don't think we should be relying on undefined
> behaviour.
> 
> I think the simple solution is to move the size==0 check up into the caller.

Right. I addressed that in patch 4 (which basically removes
mmu_hw_do_operation_locked()), but I can do it here too.

> 
> Thanks,
> Steve
> 
> > @@ -565,11 +563,7 @@ static int lock_region(struct panthor_device *ptdev, u32 as_nr,
> >  	 */
> >  	region_start &= GENMASK_ULL(63, region_width);
> >  
> > -	region = region_width | region_start;
> > -
> > -	/* Lock the region that needs to be updated */
> > -	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
> > -	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
> > +	return region_width | region_start;
> >  }
> >  
> >  static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
> > @@ -602,7 +596,9 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
> >  	 * power it up
> >  	 */
> >  
> > -	ret = lock_region(ptdev, as_nr, iova, size);
> > +	/* Lock the region that needs to be updated */
> > +	gpu_write64(ptdev, AS_LOCKADDR(as_nr), pack_region_range(iova, size));
> > +	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
> >  	if (ret)
> >  		return ret;
> >    
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures
  2025-11-17 15:44     ` Boris Brezillon
@ 2025-11-17 16:10       ` Steven Price
  0 siblings, 0 replies; 16+ messages in thread
From: Steven Price @ 2025-11-17 16:10 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Liviu Dudau, Adrián Larumbe, dri-devel, Akash Goel,
	Karunika Choo, kernel

On 17/11/2025 15:44, Boris Brezillon wrote:
> On Mon, 17 Nov 2025 14:02:35 +0000
> Steven Price <steven.price@arm.com> wrote:
> 
>> On 13/11/2025 10:39, Boris Brezillon wrote:
>>> We have seen a few cases where the whole memory subsystem is blocked
>>> and flush operations never complete. When that happens, we want to:
>>>
>>> - schedule a reset, so we can recover from this situation
>>> - in the reset path, we need to reset the pending_reqs so we can send
>>>   new commands after the reset
>>> - if more panthor_gpu_flush_caches() operations are queued after
>>>   the timeout, we skip them and return -EIO directly to avoid needless
>>>   waits (the memory block won't miraculously work again)  
>>
>> You've removed the WARN from this last case. Is this intentional? I
>> agree the recovery is better, but I don't think we expect this to happen
>> - so it's pointing to something else being broken.
> 
> I did because there's a way for the UMD to trigger that (see the link
> to the bug in the cover letter) without any mitigation we can put in
> place kernel side, other than the GPU reset I'm adding here.I
> tend to use WARN_ON()s only for things the kernel code has control on,
> not stuff users can force the kernel driver into. Note that I kept the
> drm_err(), so we still have a trace of such errors in the logs (along
> with some timeouts).

Ah, fair enough - would be good to put that in the commit message ;) The
cover letter for this posting doesn't seem to have the link either. With
an updated commit message:

Reviewed-by: Steven Price <steven.price@arm.com>

>>
>>>
>>> v2:
>>> - New patch
>>>
>>> Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
>>> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
>>> ---
>>>  drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
>>>  1 file changed, 12 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
>>> index eda670229184..abd2fde04da9 100644
>>> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
>>> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
>>> @@ -283,38 +283,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
>>>  int panthor_gpu_flush_caches(struct panthor_device *ptdev,
>>>  			     u32 l2, u32 lsc, u32 other)
>>>  {
>>> -	bool timedout = false;
>>>  	unsigned long flags;
>>> +	int ret = 0;
>>>  
>>>  	/* Serialize cache flush operations. */
>>>  	guard(mutex)(&ptdev->gpu->cache_flush_lock);
>>>  
>>>  	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
>>> -	if (!drm_WARN_ON(&ptdev->base,
>>> -			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
>>> +	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
>>>  		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
>>>  		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
>>> +	} else {
>>> +		ret = -EIO;
>>>  	}
>>>  	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>>>  
>>> +	if (ret)
>>> +		return ret;
>>> +
>>>  	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
>>>  				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
>>>  				msecs_to_jiffies(100))) {
>>>  		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
>>>  		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
>>>  		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
>>> -			timedout = true;
>>> +			ret = -ETIMEDOUT;
>>>  		else
>>>  			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
>>>  		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>>>  	}
>>>  
>>> -	if (timedout) {
>>> +	if (ret) {
>>> +		panthor_device_schedule_reset(ptdev);
>>>  		drm_err(&ptdev->base, "Flush caches timeout");
>>> -		return -ETIMEDOUT;
>>>  	}
>>>  
>>> -	return 0;
>>> +	return ret;
>>>  }
>>>  
>>>  /**
>>> @@ -354,6 +358,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
>>>  		return -ETIMEDOUT;
>>>  	}
>>>  
>>> +	ptdev->gpu->pending_reqs = 0;
>>>  	return 0;
>>>  }
>>>    
>>
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-11-17 16:10 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-13 10:39 [PATCH v2 0/6] drm/panthor: Misc fixes Boris Brezillon
2025-11-13 10:39 ` [PATCH v2 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
2025-11-17 12:29   ` Steven Price
2025-11-13 10:39 ` [PATCH v2 2/6] drm/panthor: Kill lock_region() Boris Brezillon
2025-11-17 12:44   ` Steven Price
2025-11-17 15:46     ` Boris Brezillon
2025-11-13 10:39 ` [PATCH v2 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
2025-11-17 14:02   ` Steven Price
2025-11-17 15:44     ` Boris Brezillon
2025-11-17 16:10       ` Steven Price
2025-11-13 10:39 ` [PATCH v2 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
2025-11-17 14:26   ` Steven Price
2025-11-13 10:39 ` [PATCH v2 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
2025-11-17 15:07   ` Steven Price
2025-11-13 10:39 ` [PATCH v2 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
2025-11-17 15:12   ` Steven Price

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.