The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* [PATCH v2] workqueue: dump the last woken worker for stalled pools
@ 2026-07-01 11:05 Breno Leitao
  2026-07-01 18:01 ` Tejun Heo
  0 siblings, 1 reply; 2+ messages in thread
From: Breno Leitao @ 2026-07-01 11:05 UTC (permalink / raw)
  To: Tejun Heo, Lai Jiangshan
  Cc: linux-kernel, david.dai, kernel-team, Petr Mladek, Breno Leitao

To identify the task most likely responsible for a stall, add
last_woken_worker (L: pool->lock) to worker_pool and record it in
kick_pool_pick() when an idle worker is selected for wakeup.  This
captures the idle worker that was kicked to take over when the last
running worker went to sleep; if the pool is now stuck with no running
worker, that task is the prime suspect and its backtrace is dumped by
show_pool_no_running_worker().

Using struct worker * rather than struct task_struct * avoids any
lifetime concern: workers are only destroyed via set_worker_dying()
which requires pool->lock, and set_worker_dying() clears
last_woken_worker when the dying worker matches.
show_cpu_pool_busy_workers() holds pool->lock while it calls
show_pool_no_running_worker(), which is where sched_show_task() runs,
so last_woken_worker is either NULL or points to a live worker with a
valid task.  More precisely, set_worker_dying() clears
last_woken_worker before setting WORKER_DIE, so a non-NULL
last_woken_worker means the kthread has not yet exited and worker->task
is still alive.

Suggested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes from v1:
Rebased on top of tj-wq/for-7.3
v1 link:
https://lore.kernel.org/all/20260630-wq_dump_petr-v2-3-c944cec38fc3@debian.org/
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3aea405d7735..86b6e43d41b52 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -226,6 +226,8 @@ struct worker_pool {
 						/* L: hash of busy workers */
 
 	struct worker		*manager;	/* L: purely informational */
+	/* L: last worker woken by kick_pool() */
+	struct worker		*last_woken_worker;
 	struct list_head	workers;	/* A: attached workers */
 
 	struct ida		worker_ida;	/* worker IDs for task name */
@@ -1318,6 +1320,9 @@ static bool kick_pool_pick(struct worker_pool *pool, struct task_struct **wakep)
 		}
 	}
 #endif
+	/* Track the last idle worker woken, used for stall diagnostics. */
+	pool->last_woken_worker = worker;
+
 	*wakep = p;
 	return true;
 }
@@ -2976,6 +2981,13 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
+	/*
+	 * Clear last_woken_worker if it points to this worker, so that
+	 * show_cpu_pool_busy_workers() cannot dereference a freed worker.
+	 */
+	if (pool->last_woken_worker == worker)
+		pool->last_woken_worker = NULL;
+
 	worker->flags |= WORKER_DIE;
 
 	list_move(&worker->entry, list);
@@ -7740,13 +7752,25 @@ static void show_pool_no_running_worker(struct worker_pool *pool)
 		idle_cpu(pool->cpu) ? "idle" : "busy",
 		pool->nr_workers, pool->nr_idle);
 	pr_info("The pool might have trouble waking an idle worker.\n");
+	/*
+	 * last_woken_worker and its task are valid here: set_worker_dying()
+	 * clears it under pool->lock before setting WORKER_DIE, so if
+	 * last_woken_worker is non-NULL the kthread has not yet exited and
+	 * worker->task is still alive.
+	 */
+	if (pool->last_woken_worker) {
+		pr_info("Backtrace of last woken worker:\n");
+		sched_show_task(pool->last_woken_worker->task);
+	} else {
+		pr_info("Last woken worker empty\n");
+	}
 	printk_deferred_exit();
 }
 
 /*
  * Show running workers that might prevent the processing of pending work items.
  * If no running worker is found, the pool may be stuck waiting for an idle
- * worker to be woken, so report the pool state.
+ * worker to be woken, so report the pool state and the last woken worker.
  */
 static void show_cpu_pool_busy_workers(struct worker_pool *pool)
 {
@@ -7781,7 +7805,8 @@ static void show_cpu_pool_busy_workers(struct worker_pool *pool)
 
 	/*
 	 * If no running worker was found, the pool is likely stuck. Print pool
-	 * state.
+	 * state and the backtrace of the last woken worker, which is the prime
+	 * suspect for the stall.
 	 */
 	if (!found_running)
 		show_pool_no_running_worker(pool);

---
base-commit: f7dc93388946dacae5ddf6bdf55822f066798a40
change-id: 20260701-rest_petr-2e18ae0b3ed0

Best regards,
-- 
Breno Leitao <leitao@debian.org>


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-07-01 18:02 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-07-01 11:05 [PATCH v2] workqueue: dump the last woken worker for stalled pools Breno Leitao
2026-07-01 18:01 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox