From: Daniel J Blueman <daniel@quora.org>
To: Rob Clark <robin.clark@oss.qualcomm.com>,
Dmitry Baryshkov <lumag@kernel.org>,
Abhinav Kumar <abhinav.kumar@linux.dev>,
Jessica Zhang <jesszhan0024@gmail.com>,
Sean Paul <sean@poorly.run>,
Marijn Suijten <marijn.suijten@somainline.org>,
David Airlie <airlied@gmail.com>, Simona Vetter <simona@ffwll.ch>,
Antonino Maniscalco <antomani103@gmail.com>
Cc: Daniel J Blueman <daniel@quora.org>,
linux-arm-msm@vger.kernel.org, dri-devel@lists.freedesktop.org,
freedreno@lists.freedesktop.org, linux-kernel@vger.kernel.org,
stable@vger.kernel.org
Subject: [PATCH] drm/msm: Fix shrinker deadlock
Date: Fri, 8 May 2026 14:57:21 +0800 [thread overview]
Message-ID: <20260508065722.18785-1-daniel@quora.org> (raw)
With PROVE_LOCKING enabled on a Snapdragon X1 system under VM reclaim pressure, we see:
"""
kswapd0/121 is trying to acquire lock:
ffff800080ed3800 (reservation_ww_class_acquire){+.+.}-{0:0}, at:
msm_gem_shrinker_scan (drivers/gpu/drm/msm/msm_gem_shrinker.c:189)
but task is already holding lock:
ffffbf4ddb44ca40 (fs_reclaim){+.+.}-{0:0}, at:
balance_pgdat (mm/vmscan.c:7236 (discriminator 2))
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (fs_reclaim){+.+.}-{0:0}:
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
fs_reclaim_acquire (mm/page_alloc.c:4325 mm/page_alloc.c:4339)
dma_resv_lockdep (drivers/dma-buf/dma-resv.c:798)
do_one_initcall (init/main.c:1392)
kernel_init_freeable (init/main.c:1454 (discriminator 1) init/main.c:1470
(discriminator 1) init/main.c:1490 (discriminator 1) init/main.c:1703
(discriminator 1))
kernel_init (init/main.c:1593)
ret_from_fork (arch/arm64/kernel/entry.S:858)
-> #1 (reservation_ww_class_mutex){+.+.}-{4:4}:
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
dma_resv_lockdep (./include/linux/ww_mutex.h:164 (discriminator 1)
drivers/dma-buf/dma-resv.c:791 (discriminator 1))
do_one_initcall (init/main.c:1392)
kernel_init_freeable (init/main.c:1454 (discriminator 1) init/main.c:1470
(discriminator 1) init/main.c:1490 (discriminator 1) init/main.c:1703
(discriminator 1))
kernel_init (init/main.c:1593)
ret_from_fork (arch/arm64/kernel/entry.S:858)
-> #0 (reservation_ww_class_acquire){+.+.}-{0:0}:
check_prev_add (kernel/locking/lockdep.c:3165)
__lock_acquire (kernel/locking/lockdep.c:3284
kernel/locking/lockdep.c:3908 kernel/locking/lockdep.c:5237)
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
drm_gem_lru_scan (./include/linux/ww_mutex.h:163 (discriminator 1)
drivers/gpu/drm/drm_gem.c:1681 (discriminator 1))
msm_gem_shrinker_scan (drivers/gpu/drm/msm/msm_gem_shrinker.c:189)
do_shrink_slab (mm/shrinker.c:436)
shrink_slab (mm/shrinker.c:667)
shrink_one (mm/vmscan.c:5071)
shrink_node (mm/vmscan.c:5132 mm/vmscan.c:5210 mm/vmscan.c:6198)
balance_pgdat (mm/vmscan.c:7052 mm/vmscan.c:7228)
kswapd (mm/vmscan.c:7501)
kthread (kernel/kthread.c:436)
ret_from_fork (arch/arm64/kernel/entry.S:858)
other info that might help us debug this:
Chain exists of:
reservation_ww_class_acquire --> reservation_ww_class_mutex --> fs_reclaim
"""
kswapd0, while holding fs_reclaim, invokes the MSM shrinker, which calls
dma_resv_lock(). Per the dependency chain lockdep recorded above, the
reservation_ww_class acquire path in turn depends on fs_reclaim,
completing the cycle.
Fix this deadlock by using dma_resv_trylock() instead, and drop the
wound/wait acquire context 'ticket' parameter, which is then unused.
Cc: stable@vger.kernel.org
Signed-off-by: Daniel J Blueman <daniel@quora.org>
Fixes: fe4952b5f27c ("drm/msm: Convert vm locking")
---
drivers/gpu/drm/msm/msm_gem_shrinker.c | 34 ++++++++++----------------
1 file changed, 13 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
index 31fa51a44f86..5320ef57dd90 100644
--- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
+++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
@@ -43,8 +43,7 @@ msm_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc)
}
static bool
-with_vm_locks(struct ww_acquire_ctx *ticket,
- void (*fn)(struct drm_gem_object *obj),
+with_vm_locks(void (*fn)(struct drm_gem_object *obj),
struct drm_gem_object *obj)
{
/*
@@ -52,7 +51,7 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
* success paths
*/
struct drm_gpuvm_bo *vm_bo, *last_locked = NULL;
- int ret = 0;
+ bool locked = true;
drm_gem_for_each_gpuvm_bo (vm_bo, obj) {
struct dma_resv *resv = drm_gpuvm_resv(vm_bo->vm);
@@ -60,23 +59,14 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
if (resv == obj->resv)
continue;
- ret = dma_resv_lock(resv, ticket);
-
/*
- * Since we already skip the case when the VM and obj
- * share a resv (ie. _NO_SHARE objs), we don't expect
- * to hit a double-locking scenario... which the lock
- * unwinding cannot really cope with.
+ * dma_resv_lock can't be used due to acquiring 'ticket' before the
+ * fs_reclaim lock, which is held in shrinker context
*/
- WARN_ON(ret == -EALREADY);
-
- /*
- * Don't bother with slow-lock / backoff / retry sequence,
- * if we can't get the lock just give up and move on to
- * the next object.
- */
- if (ret)
+ if (!dma_resv_trylock(resv)) {
+ locked = false;
goto out_unlock;
+ }
/*
* Hold a ref to prevent the vm_bo from being freed
@@ -108,7 +98,7 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
}
}
- return ret == 0;
+ return locked;
}
static bool
@@ -120,7 +110,7 @@ purge(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket)
if (msm_gem_active(obj))
return false;
- return with_vm_locks(ticket, msm_gem_purge, obj);
+ return with_vm_locks(msm_gem_purge, obj);
}
static bool
@@ -164,7 +154,6 @@ static unsigned long
msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
{
struct msm_drm_private *priv = shrinker->private_data;
- struct ww_acquire_ctx ticket;
struct {
struct drm_gem_lru *lru;
bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket);
@@ -185,11 +174,14 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
for (unsigned i = 0; (nr > 0) && (i < ARRAY_SIZE(stages)); i++) {
if (!stages[i].cond)
continue;
+ /*
+ * 'ticket' not needed on trylock paths
+ */
stages[i].freed =
drm_gem_lru_scan(stages[i].lru, nr,
&stages[i].remaining,
stages[i].shrink,
- &ticket);
+ NULL);
nr -= stages[i].freed;
freed += stages[i].freed;
remaining += stages[i].remaining;
--
2.53.0
next reply other threads:[~2026-05-08 6:57 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-08 6:57 Daniel J Blueman [this message]
2026-05-09 15:34 ` [PATCH] drm/msm: Fix shrinker deadlock Rob Clark
2026-05-11 7:37 ` Boris Brezillon
2026-05-11 13:16 ` Rob Clark
2026-05-10 12:30 ` kernel test robot
2026-05-10 17:08 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260508065722.18785-1-daniel@quora.org \
--to=daniel@quora.org \
--cc=abhinav.kumar@linux.dev \
--cc=airlied@gmail.com \
--cc=antomani103@gmail.com \
--cc=dri-devel@lists.freedesktop.org \
--cc=freedreno@lists.freedesktop.org \
--cc=jesszhan0024@gmail.com \
--cc=linux-arm-msm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=lumag@kernel.org \
--cc=marijn.suijten@somainline.org \
--cc=robin.clark@oss.qualcomm.com \
--cc=sean@poorly.run \
--cc=simona@ffwll.ch \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox