[PATCH v4 0/6] drm/panthor: Misc fixes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4 0/6] drm/panthor: Misc fixes
@ 2025-11-28  8:48 Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Hello,

This is a set of fixes for bugs I ran into while looking at [1].
Hopefully that's enough to recover from AS_ACTIVE bit stuck
situations, but it'd be good to understand why the MMU block is
completely blocked in some cases and try to come up with better
mitigations than a full GPU reset.

This v4 is here to address a conflict caused by other panthor
patches being merged before this one.

Regards,

Boris

[1]https://gitlab.freedesktop.org/panfrost/linux/-/issues/57

Boris Brezillon (6):
  drm/panthor: Always wait after sending a command to an AS
  drm/panthor: Kill lock_region()
  drm/panthor: Recover from panthor_gpu_flush_caches() failures
  drm/panthor: Add support for atomic page table updates
  drm/panthor: Make panthor_vm_[un]map_pages() more robust
  drm/panthor: Relax a check in panthor_sched_pre_reset()

 drivers/gpu/drm/panthor/panthor_gpu.c   |  19 +-
 drivers/gpu/drm/panthor/panthor_mmu.c   | 280 +++++++++++++-----------
 drivers/gpu/drm/panthor/panthor_sched.c |   2 -
 3 files changed, 166 insertions(+), 135 deletions(-)

-- 
2.51.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v4 1/6] drm/panthor: Always wait after sending a command to an AS
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 2/6] drm/panthor: Kill lock_region() Boris Brezillon
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

There's currently no situation where we want to issue a command to an
AS and not wait for this command to complete. The wait is either
explicitly done (LOCK, UNLOCK) or it's missing (UPDATE). So let's
turn write_cmd() into as_send_cmd_and_wait() that has the wait after
a command is sent.

v2:
- New patch

v3:
- Collect R-b

v4:
- No changes

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 8818d6a8d93e..f59331f89b33 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -524,27 +524,29 @@ static int wait_ready(struct panthor_device *ptdev, u32 as_nr)
 	return ret;
 }
 
-static int write_cmd(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
+static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd)
 {
 	int status;
 
 	/* write AS_COMMAND when MMU is ready to accept another command */
 	status = wait_ready(ptdev, as_nr);
-	if (!status)
+	if (!status) {
 		gpu_write(ptdev, AS_COMMAND(as_nr), cmd);
+		status = wait_ready(ptdev, as_nr);
+	}
 
 	return status;
 }
 
-static void lock_region(struct panthor_device *ptdev, u32 as_nr,
-			u64 region_start, u64 size)
+static int lock_region(struct panthor_device *ptdev, u32 as_nr,
+		       u64 region_start, u64 size)
 {
 	u8 region_width;
 	u64 region;
 	u64 region_end = region_start + size;
 
 	if (!size)
-		return;
+		return 0;
 
 	/*
 	 * The locked region is a naturally aligned power of 2 block encoded as
@@ -567,7 +569,7 @@ static void lock_region(struct panthor_device *ptdev, u32 as_nr,
 
 	/* Lock the region that needs to be updated */
 	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
-	write_cmd(ptdev, as_nr, AS_COMMAND_LOCK);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
 }
 
 static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
@@ -600,9 +602,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * power it up
 	 */
 
-	lock_region(ptdev, as_nr, iova, size);
-
-	ret = wait_ready(ptdev, as_nr);
+	ret = lock_region(ptdev, as_nr, iova, size);
 	if (ret)
 		return ret;
 
@@ -615,10 +615,7 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * at the end of the GPU_CONTROL cache flush command, unlike
 	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
 	 */
-	write_cmd(ptdev, as_nr, AS_COMMAND_UNLOCK);
-
-	/* Wait for the unlock command to complete */
-	return wait_ready(ptdev, as_nr);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
 }
 
 static int mmu_hw_do_operation(struct panthor_vm *vm,
@@ -647,7 +644,7 @@ static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
 
-	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
 }
 
 static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
@@ -662,7 +659,7 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), 0);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), AS_TRANSCFG_ADRMODE_UNMAPPED);
 
-	return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE);
+	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UPDATE);
 }
 
 static u32 panthor_mmu_fault_mask(struct panthor_device *ptdev, u32 value)
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 2/6] drm/panthor: Kill lock_region()
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

The meat in lock_region() is about packing a region range into a
single u64. The rest is just a regular reg write plus a
as_send_cmd_and_wait() call that can easily be inlined in
mmu_hw_do_operation_locked().

v2:
- New patch

v3:
- Don't LOCK is the region has a zero size

v4:
- Collect R-b

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index f59331f89b33..b88a6d3096a0 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -538,14 +538,12 @@ static int as_send_cmd_and_wait(struct panthor_device *ptdev, u32 as_nr, u32 cmd
 	return status;
 }
 
-static int lock_region(struct panthor_device *ptdev, u32 as_nr,
-		       u64 region_start, u64 size)
+static u64 pack_region_range(struct panthor_device *ptdev, u64 region_start, u64 size)
 {
 	u8 region_width;
-	u64 region;
 	u64 region_end = region_start + size;
 
-	if (!size)
+	if (drm_WARN_ON_ONCE(&ptdev->base, !size))
 		return 0;
 
 	/*
@@ -565,11 +563,7 @@ static int lock_region(struct panthor_device *ptdev, u32 as_nr,
 	 */
 	region_start &= GENMASK_ULL(63, region_width);
 
-	region = region_width | region_start;
-
-	/* Lock the region that needs to be updated */
-	gpu_write64(ptdev, AS_LOCKADDR(as_nr), region);
-	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
+	return region_width | region_start;
 }
 
 static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
@@ -581,6 +575,9 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 
 	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
 
+	if (!size)
+		return 0;
+
 	switch (op) {
 	case AS_COMMAND_FLUSH_MEM:
 		lsc_flush_op = CACHE_CLEAN | CACHE_INV;
@@ -602,7 +599,10 @@ static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
 	 * power it up
 	 */
 
-	ret = lock_region(ptdev, as_nr, iova, size);
+	/* Lock the region that needs to be updated */
+	gpu_write64(ptdev, AS_LOCKADDR(as_nr),
+		    pack_region_range(ptdev, iova, size));
+	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
 	if (ret)
 		return ret;
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 2/6] drm/panthor: Kill lock_region() Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

We have seen a few cases where the whole memory subsystem is blocked
and flush operations never complete. When that happens, we want to:

- schedule a reset, so we can recover from this situation
- in the reset path, we need to reset the pending_reqs so we can send
  new commands after the reset
- if more panthor_gpu_flush_caches() operations are queued after
  the timeout, we skip them and return -EIO directly to avoid needless
  waits (the memory block won't miraculously work again)

Note that we drop the WARN_ON()s because these hangs can be triggered
with buggy GPU jobs created by the UMD, and there's no way we can
prevent it. We do keep the error messages though.

v2:
- New patch

v3:
- Collect R-b
- Explicitly mention the fact we dropped the WARN_ON()s in the commit
  message

v4:
- No changes

Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
index 06b231b2460a..9cb5dee93212 100644
--- a/drivers/gpu/drm/panthor/panthor_gpu.c
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -289,38 +289,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
 int panthor_gpu_flush_caches(struct panthor_device *ptdev,
 			     u32 l2, u32 lsc, u32 other)
 {
-	bool timedout = false;
 	unsigned long flags;
+	int ret = 0;
 
 	/* Serialize cache flush operations. */
 	guard(mutex)(&ptdev->gpu->cache_flush_lock);
 
 	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
-	if (!drm_WARN_ON(&ptdev->base,
-			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
+	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
 		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
+	} else {
+		ret = -EIO;
 	}
 	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 
+	if (ret)
+		return ret;
+
 	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
 				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
 				msecs_to_jiffies(100))) {
 		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
 		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
 		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
-			timedout = true;
+			ret = -ETIMEDOUT;
 		else
 			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 	}
 
-	if (timedout) {
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
 		drm_err(&ptdev->base, "Flush caches timeout");
-		return -ETIMEDOUT;
 	}
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -360,6 +364,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
 		return -ETIMEDOUT;
 	}
 
+	ptdev->gpu->pending_reqs = 0;
 	return 0;
 }
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 4/6] drm/panthor: Add support for atomic page table updates
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (2 preceding siblings ...)
  2025-11-28  8:48 ` [PATCH v4 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Move the lock/flush_mem operations around the gpuvm_sm_[un]map() calls
so we can implement true atomic page updates, where any access in the
locked range done by the GPU has to wait for the page table updates
to land before proceeding.

This is needed for vkQueueBindSparse(), so we can replace the dummy
page mapped over the entire object by actual BO backed pages in an atomic
way. But it's also useful to avoid "AS_ACTIVE bit stuck" failures in
the sm_[un]map() path, leading to gpuvm state inconsistencies.

v2:
- Adjust to match the two new preliminary patches

v3:
- Collect R-b

v4:
- No changes

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 190 +++++++++++++-------------
 1 file changed, 97 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index b88a6d3096a0..f39e6e799c74 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -389,6 +389,15 @@ struct panthor_vm {
 	 * flagged as faulty as a result.
 	 */
 	bool unhandled_fault;
+
+	/** @locked_region: Information about the currently locked region currently. */
+	struct {
+		/** @locked_region.start: Start of the locked region. */
+		u64 start;
+
+		/** @locked_region.size: Size of the locked region. */
+		u64 size;
+	} locked_region;
 };
 
 /**
@@ -566,80 +575,9 @@ static u64 pack_region_range(struct panthor_device *ptdev, u64 region_start, u64
 	return region_width | region_start;
 }
 
-static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr,
-				      u64 iova, u64 size, u32 op)
-{
-	const u32 l2_flush_op = CACHE_CLEAN | CACHE_INV;
-	u32 lsc_flush_op;
-	int ret;
-
-	lockdep_assert_held(&ptdev->mmu->as.slots_lock);
-
-	if (!size)
-		return 0;
-
-	switch (op) {
-	case AS_COMMAND_FLUSH_MEM:
-		lsc_flush_op = CACHE_CLEAN | CACHE_INV;
-		break;
-	case AS_COMMAND_FLUSH_PT:
-		lsc_flush_op = 0;
-		break;
-	default:
-		drm_WARN(&ptdev->base, 1, "Unexpected AS_COMMAND: %d", op);
-		return -EINVAL;
-	}
-
-	if (as_nr < 0)
-		return 0;
-
-	/*
-	 * If the AS number is greater than zero, then we can be sure
-	 * the device is up and running, so we don't need to explicitly
-	 * power it up
-	 */
-
-	/* Lock the region that needs to be updated */
-	gpu_write64(ptdev, AS_LOCKADDR(as_nr),
-		    pack_region_range(ptdev, iova, size));
-	ret = as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_LOCK);
-	if (ret)
-		return ret;
-
-	ret = panthor_gpu_flush_caches(ptdev, l2_flush_op, lsc_flush_op, 0);
-	if (ret)
-		return ret;
-
-	/*
-	 * Explicitly unlock the region as the AS is not unlocked automatically
-	 * at the end of the GPU_CONTROL cache flush command, unlike
-	 * AS_COMMAND_FLUSH_MEM or AS_COMMAND_FLUSH_PT.
-	 */
-	return as_send_cmd_and_wait(ptdev, as_nr, AS_COMMAND_UNLOCK);
-}
-
-static int mmu_hw_do_operation(struct panthor_vm *vm,
-			       u64 iova, u64 size, u32 op)
-{
-	struct panthor_device *ptdev = vm->ptdev;
-	int ret;
-
-	mutex_lock(&ptdev->mmu->as.slots_lock);
-	ret = mmu_hw_do_operation_locked(ptdev, vm->as.id, iova, size, op);
-	mutex_unlock(&ptdev->mmu->as.slots_lock);
-
-	return ret;
-}
-
 static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr,
 				 u64 transtab, u64 transcfg, u64 memattr)
 {
-	int ret;
-
-	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
-	if (ret)
-		return ret;
-
 	gpu_write64(ptdev, AS_TRANSTAB(as_nr), transtab);
 	gpu_write64(ptdev, AS_MEMATTR(as_nr), memattr);
 	gpu_write64(ptdev, AS_TRANSCFG(as_nr), transcfg);
@@ -651,7 +589,9 @@ static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr)
 {
 	int ret;
 
-	ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
+	/* Flush+invalidate RW caches, invalidate RO ones. */
+	ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
+				       CACHE_CLEAN | CACHE_INV, CACHE_INV);
 	if (ret)
 		return ret;
 
@@ -733,6 +673,10 @@ int panthor_vm_active(struct panthor_vm *vm)
 	if (refcount_inc_not_zero(&vm->as.active_cnt))
 		goto out_dev_exit;
 
+	/* Make sure we don't race with lock/unlock_region() calls
+	 * happening around VM bind operations.
+	 */
+	mutex_lock(&vm->op_lock);
 	mutex_lock(&ptdev->mmu->as.slots_lock);
 
 	if (refcount_inc_not_zero(&vm->as.active_cnt))
@@ -800,6 +744,10 @@ int panthor_vm_active(struct panthor_vm *vm)
 		gpu_write(ptdev, MMU_INT_MASK, ~ptdev->mmu->as.faulty_mask);
 	}
 
+	/* The VM update is guarded by ::op_lock, which we take at the beginning
+	 * of this function, so we don't expect any locked region here.
+	 */
+	drm_WARN_ON(&vm->ptdev->base, vm->locked_region.size > 0);
 	ret = panthor_mmu_as_enable(vm->ptdev, vm->as.id, transtab, transcfg, vm->memattr);
 
 out_make_active:
@@ -810,6 +758,7 @@ int panthor_vm_active(struct panthor_vm *vm)
 
 out_unlock:
 	mutex_unlock(&ptdev->mmu->as.slots_lock);
+	mutex_unlock(&vm->op_lock);
 
 out_dev_exit:
 	drm_dev_exit(cookie);
@@ -893,24 +842,6 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
 	return SZ_2M;
 }
 
-static int panthor_vm_flush_range(struct panthor_vm *vm, u64 iova, u64 size)
-{
-	struct panthor_device *ptdev = vm->ptdev;
-	int ret = 0, cookie;
-
-	if (vm->as.id < 0)
-		return 0;
-
-	/* If the device is unplugged, we just silently skip the flush. */
-	if (!drm_dev_enter(&ptdev->base, &cookie))
-		return 0;
-
-	ret = mmu_hw_do_operation(vm, iova, size, AS_COMMAND_FLUSH_PT);
-
-	drm_dev_exit(cookie);
-	return ret;
-}
-
 static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 {
 	struct panthor_device *ptdev = vm->ptdev;
@@ -918,6 +849,10 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 	u64 start_iova = iova;
 	u64 offset = 0;
 
+	drm_WARN_ON(&ptdev->base,
+		    (iova < vm->locked_region.start) ||
+		    (iova + size > vm->locked_region.start + vm->locked_region.size));
+
 	while (offset < size) {
 		size_t unmapped_sz = 0, pgcount;
 		size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
@@ -929,7 +864,6 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 				iova + offset + unmapped_sz,
 				iova + offset + pgsize * pgcount,
 				iova, iova + size);
-			panthor_vm_flush_range(vm, iova, offset + unmapped_sz);
 			return  -EINVAL;
 		}
 
@@ -941,7 +875,7 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 		offset += unmapped_sz;
 	}
 
-	return panthor_vm_flush_range(vm, iova, size);
+	return 0;
 }
 
 static int
@@ -959,6 +893,10 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 	if (!size)
 		return 0;
 
+	drm_WARN_ON(&ptdev->base,
+		    (iova < vm->locked_region.start) ||
+		    (iova + size > vm->locked_region.start + vm->locked_region.size));
+
 	for_each_sgtable_dma_sg(sgt, sgl, count) {
 		dma_addr_t paddr = sg_dma_address(sgl);
 		size_t len = sg_dma_len(sgl);
@@ -1009,7 +947,7 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 		offset = 0;
 	}
 
-	return panthor_vm_flush_range(vm, start_iova, iova - start_iova);
+	return 0;
 }
 
 static int flags_to_prot(u32 flags)
@@ -1692,6 +1630,62 @@ static const char *access_type_name(struct panthor_device *ptdev,
 	}
 }
 
+static int panthor_vm_lock_region(struct panthor_vm *vm, u64 start, u64 size)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int ret = 0;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	drm_WARN_ON(&ptdev->base, vm->locked_region.start || vm->locked_region.size);
+	if (vm->as.id >= 0 && size) {
+		/* Lock the region that needs to be updated */
+		gpu_write64(ptdev, AS_LOCKADDR(vm->as.id),
+			    pack_region_range(ptdev, start, size));
+
+		/* If the lock succeeded, update the locked_region info. */
+		ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_LOCK);
+	}
+
+	if (!ret) {
+		vm->locked_region.start = start;
+		vm->locked_region.size = size;
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+
+	return ret;
+}
+
+static void panthor_vm_unlock_region(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	if (vm->as.id >= 0) {
+		int ret;
+
+		/* flush+invalidate RW caches and invalidate RO ones.
+		 * TODO: See if we can use FLUSH_PA_RANGE when the physical
+		 * range is narrow enough and the HW supports it.
+		 */
+		ret = panthor_gpu_flush_caches(ptdev, CACHE_CLEAN | CACHE_INV,
+					       CACHE_CLEAN | CACHE_INV,
+					       CACHE_INV);
+
+		/* Unlock the region if the flush is effective. */
+		if (!ret)
+			ret = as_send_cmd_and_wait(ptdev, vm->as.id, AS_COMMAND_UNLOCK);
+
+		/* If we fail to flush or unlock the region, schedule a GPU reset
+		 * to unblock the situation.
+		 */
+		if (ret)
+			panthor_device_schedule_reset(ptdev);
+	}
+	vm->locked_region.start = 0;
+	vm->locked_region.size = 0;
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
 static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status)
 {
 	bool has_unhandled_faults = false;
@@ -1896,6 +1890,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
 	drm_sched_entity_destroy(&vm->entity);
 	drm_sched_fini(&vm->sched);
 
+	mutex_lock(&vm->op_lock);
 	mutex_lock(&ptdev->mmu->as.slots_lock);
 	if (vm->as.id >= 0) {
 		int cookie;
@@ -1910,6 +1905,7 @@ static void panthor_vm_free(struct drm_gpuvm *gpuvm)
 		list_del(&vm->as.lru_node);
 	}
 	mutex_unlock(&ptdev->mmu->as.slots_lock);
+	mutex_unlock(&vm->op_lock);
 
 	free_io_pgtable_ops(vm->pgtbl_ops);
 
@@ -2219,6 +2215,11 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 
 	mutex_lock(&vm->op_lock);
 	vm->op_ctx = op;
+
+	ret = panthor_vm_lock_region(vm, op->va.addr, op->va.range);
+	if (ret)
+		goto out;
+
 	switch (op_type) {
 	case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: {
 		const struct drm_gpuvm_map_req map_req = {
@@ -2246,6 +2247,9 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 		break;
 	}
 
+	panthor_vm_unlock_region(vm);
+
+out:
 	if (ret && flag_vm_unusable_on_failure)
 		vm->unusable = true;
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (3 preceding siblings ...)
  2025-11-28  8:48 ` [PATCH v4 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  8:48 ` [PATCH v4 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
  2025-11-28  9:19 ` [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

There's no reason for panthor_vm_[un]map_pages() to fail unless the
drm_gpuvm state and the page table are out of sync, so let's reflect that
by making panthor_vm_unmap_pages() a void function and adding
WARN_ON()s in various places. We also try to recover from those
unexpected mismatch by checking for already unmapped ranges and skipping
them. But there's only so much we can do to try and cope with such
SW bugs, so when we see a mismatch, we flag the VM unusable and disable
the AS to avoid further GPU accesses to the memory.

It could be that the as_disable() call fails because the MMU unit is
stuck, in which case the whole GPU is frozen, and only a GPU reset can
unblock things. Ater the reset, the VM will be seen as unusable and
any attempt to re-use it will fail, so we should be covered for any
use-after-unmap issues.

v2:
- Fix double unlock

v3:
- Collect R-b

v4:
- No changes

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 81 ++++++++++++++++++---------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index f39e6e799c74..8ba5259e3d28 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -842,13 +842,33 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
 	return SZ_2M;
 }
 
-static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
+static void panthor_vm_declare_unusable(struct panthor_vm *vm)
+{
+	struct panthor_device *ptdev = vm->ptdev;
+	int cookie;
+
+	if (vm->unusable)
+		return;
+
+	vm->unusable = true;
+	mutex_lock(&ptdev->mmu->as.slots_lock);
+	if (vm->as.id >= 0 && drm_dev_enter(&ptdev->base, &cookie)) {
+		panthor_mmu_as_disable(ptdev, vm->as.id);
+		drm_dev_exit(cookie);
+	}
+	mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
+static void panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 {
 	struct panthor_device *ptdev = vm->ptdev;
 	struct io_pgtable_ops *ops = vm->pgtbl_ops;
 	u64 start_iova = iova;
 	u64 offset = 0;
 
+	if (!size)
+		return;
+
 	drm_WARN_ON(&ptdev->base,
 		    (iova < vm->locked_region.start) ||
 		    (iova + size > vm->locked_region.start + vm->locked_region.size));
@@ -858,13 +878,28 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 		size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
 
 		unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
+		if (drm_WARN_ON_ONCE(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
+			/* Gracefully handle sparsely unmapped regions to avoid leaving
+			 * page table pages behind when the drm_gpuvm and VM page table
+			 * are out-of-sync. This is not supposed to happen, hence the
+			 * above WARN_ON().
+			 */
+			while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
+			       unmapped_sz < pgsize * pgcount)
+				unmapped_sz += SZ_4K;
 
-		if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
-			drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
-				iova + offset + unmapped_sz,
-				iova + offset + pgsize * pgcount,
-				iova, iova + size);
-			return  -EINVAL;
+			/* We're passed the point where we can try to fix things,
+			 * so flag the VM unusable to make sure it's not going
+			 * to be used anymore.
+			 */
+			panthor_vm_declare_unusable(vm);
+
+			/* If we don't make progress, we're screwed. That also means
+			 * something else prevents us from unmapping the region, but
+			 * there's not much we can do here: time for debugging.
+			 */
+			if (drm_WARN_ON_ONCE(&ptdev->base, !unmapped_sz))
+				return;
 		}
 
 		drm_dbg(&ptdev->base,
@@ -874,8 +909,6 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 
 		offset += unmapped_sz;
 	}
-
-	return 0;
 }
 
 static int
@@ -927,16 +960,17 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
 			paddr += mapped;
 			len -= mapped;
 
-			if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
+			/* If nothing was mapped, consider it an ENOMEM. */
+			if (!ret && !mapped)
 				ret = -ENOMEM;
 
-			if (ret) {
-				/* If something failed, unmap what we've already mapped before
-				 * returning. The unmap call is not supposed to fail.
+			/* If something fails, we stop there, and flag the VM unusable. */
+			if (drm_WARN_ON_ONCE(&ptdev->base, ret)) {
+				/* Unmap what we've already mapped to avoid leaving page
+				 * table pages behind.
 				 */
-				drm_WARN_ON(&ptdev->base,
-					    panthor_vm_unmap_pages(vm, start_iova,
-								   iova - start_iova));
+				panthor_vm_unmap_pages(vm, start_iova, iova - start_iova);
+				panthor_vm_declare_unusable(vm);
 				return ret;
 			}
 		}
@@ -2120,12 +2154,9 @@ static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
 	struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
 	struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
 	u64 unmap_start, unmap_range;
-	int ret;
 
 	drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
-	ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
-	if (ret)
-		return ret;
+	panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
 
 	if (op->remap.prev) {
 		prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
@@ -2165,13 +2196,9 @@ static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
 {
 	struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
 	struct panthor_vm *vm = priv;
-	int ret;
-
-	ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
-				     unmap_vma->base.va.range);
-	if (drm_WARN_ON(&vm->ptdev->base, ret))
-		return ret;
 
+	panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
+			       unmap_vma->base.va.range);
 	drm_gpuva_unmap(&op->unmap);
 	panthor_vma_unlink(vm, unmap_vma);
 	return 0;
@@ -2251,7 +2278,7 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 
 out:
 	if (ret && flag_vm_unusable_on_failure)
-		vm->unusable = true;
+		panthor_vm_declare_unusable(vm);
 
 	vm->op_ctx = NULL;
 	mutex_unlock(&vm->op_lock);
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset()
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (4 preceding siblings ...)
  2025-11-28  8:48 ` [PATCH v4 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
@ 2025-11-28  8:48 ` Boris Brezillon
  2025-11-28  9:19 ` [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  8:48 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

Groups are only moved out of the runnable lists when
panthor_group_stop() is called or when they run out of jobs.
What should not happen though is having one group added to one of
the runnable list after reset.in_progress has been set to true, but
that's not something we can easily check, so let's just drop the
WARN_ON() in panthor_sched_pre_reset().

v2:
- Adjust explanation in commit message

v3:
- Collect R-b

v4:
- No changes

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Reviewed-by: Steven Price <steven.price@arm.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index b834123a6560..1beddc175722 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2937,8 +2937,6 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev)
 	 * new jobs while we're resetting.
 	 */
 	for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) {
-		/* All groups should be in the idle lists. */
-		drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i]));
 		list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node)
 			panthor_group_stop(group);
 	}
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 0/6] drm/panthor: Misc fixes
  2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
                   ` (5 preceding siblings ...)
  2025-11-28  8:48 ` [PATCH v4 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
@ 2025-11-28  9:19 ` Boris Brezillon
  6 siblings, 0 replies; 8+ messages in thread
From: Boris Brezillon @ 2025-11-28  9:19 UTC (permalink / raw)
  To: Boris Brezillon, Steven Price, Liviu Dudau, Adrián Larumbe
  Cc: dri-devel, Akash Goel, Karunika Choo, kernel

On Fri, 28 Nov 2025 09:48:34 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> Hello,
> 
> This is a set of fixes for bugs I ran into while looking at [1].
> Hopefully that's enough to recover from AS_ACTIVE bit stuck
> situations, but it'd be good to understand why the MMU block is
> completely blocked in some cases and try to come up with better
> mitigations than a full GPU reset.
> 
> This v4 is here to address a conflict caused by other panthor
> patches being merged before this one.
> 
> Regards,
> 
> Boris
> 
> [1]https://gitlab.freedesktop.org/panfrost/linux/-/issues/57
> 
> Boris Brezillon (6):
>   drm/panthor: Always wait after sending a command to an AS
>   drm/panthor: Kill lock_region()
>   drm/panthor: Recover from panthor_gpu_flush_caches() failures
>   drm/panthor: Add support for atomic page table updates
>   drm/panthor: Make panthor_vm_[un]map_pages() more robust
>   drm/panthor: Relax a check in panthor_sched_pre_reset()

Queued to drm-misc-next.

> 
>  drivers/gpu/drm/panthor/panthor_gpu.c   |  19 +-
>  drivers/gpu/drm/panthor/panthor_mmu.c   | 280 +++++++++++++-----------
>  drivers/gpu/drm/panthor/panthor_sched.c |   2 -
>  3 files changed, 166 insertions(+), 135 deletions(-)
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2025-11-28  9:19 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-28  8:48 [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 1/6] drm/panthor: Always wait after sending a command to an AS Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 2/6] drm/panthor: Kill lock_region() Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 3/6] drm/panthor: Recover from panthor_gpu_flush_caches() failures Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 4/6] drm/panthor: Add support for atomic page table updates Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 5/6] drm/panthor: Make panthor_vm_[un]map_pages() more robust Boris Brezillon
2025-11-28  8:48 ` [PATCH v4 6/6] drm/panthor: Relax a check in panthor_sched_pre_reset() Boris Brezillon
2025-11-28  9:19 ` [PATCH v4 0/6] drm/panthor: Misc fixes Boris Brezillon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.