Date: Wed, 18 Mar 2026 04:31:08 -0700
From: Breno Leitao
To: Petr Mladek
Cc: Tejun Heo, Lai Jiangshan, Andrew Morton, linux-kernel@vger.kernel.org,
	Omar Sandoval, Song Liu, Danielle Costantino,
	kasan-dev@googlegroups.com, kernel-team@meta.com
Subject: Re: [PATCH v2 4/5] workqueue: Show all busy workers in stall diagnostics
References: <20260305-wqstall_start-at-v2-0-b60863ee0899@debian.org>
	<20260305-wqstall_start-at-v2-4-b60863ee0899@debian.org>

Hello Petr,

On Fri, Mar 13, 2026 at 05:27:40PM +0100, Petr Mladek wrote:
> I took inspiration from your patch.
> This is what comes to my mind
> on top of the current master (printing only running workers):
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index aeaec79bc09c..a044c7e42139 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -7588,12 +7588,15 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
>  {
>  	struct worker *worker;
>  	unsigned long irq_flags;
> +	bool found_running;
>  	int bkt;
>
>  	raw_spin_lock_irqsave(&pool->lock, irq_flags);
>
> +	found_running = false;
>  	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
>  		if (task_is_running(worker->task)) {
> +			found_running = true;
>  			/*
>  			 * Defer printing to avoid deadlocks in console
>  			 * drivers that queue work while holding locks
> @@ -7609,6 +7612,19 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
>  	}
>
>  	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
> +
> +	if (!found_running) {
> +		pr_info("pool %d: no worker in running state, cpu=%d is %s (nr_workers=%d nr_idle=%d)\n",
> +			pool->id, pool->cpu,
> +			idle_cpu(pool->cpu) ? "idle" : "busy",
> +			pool->nr_workers, pool->nr_idle);
> +		pr_info("The pool might have troubles to wake up another idle worker.\n");
> +		if (pool->manager) {
> +			pr_info("Backtrace of the pool manager:\n");
> +			sched_show_task(pool->manager->task);
> +		}
> +		trigger_single_cpu_backtrace(pool->cpu);
> +	}
>  }
>
>  static void show_cpu_pools_hogs(void)
>
>
> Warning: The code is not safe. We would need to add some synchronization
> of the pool->manager pointer.
>
> Even better might be to print the state and backtrace of the process
> which was woken by kick_pool() when the last running worker
> went asleep.

I agree. We should probably store the last woken worker in the
worker_pool structure and print it later.

I've spent some time verifying that the locking and lifecycle management
are correct. While I'm not completely certain, I believe it's getting
closer. An extra pair of eyes would be helpful.
This is the new version of this patch:

commit feccca7e696ead3272669ee4d4dc02b6946d0faf
Author: Breno Leitao
Date:   Mon Mar 16 09:47:09 2026 -0700

    workqueue: print diagnostic info when no worker is in running state

    show_cpu_pool_busy_workers() iterates over busy workers but gives no
    feedback when none are found in running state, which is a key
    indicator that a pool may be stuck, unable to wake an idle worker to
    process pending work.

    Add a diagnostic message when no running workers are found, reporting
    pool id, CPU, idle state, and worker counts. Also trigger a
    single-CPU backtrace for the stalled CPU.

    To identify the task most likely responsible for the stall, add
    last_woken_worker (L: pool->lock) to worker_pool and record it in
    kick_pool() just before wake_up_process(). This captures the idle
    worker that was kicked to take over when the last running worker went
    to sleep; if the pool is now stuck with no running worker, that task
    is the prime suspect and its backtrace is dumped.

    Using struct worker * rather than struct task_struct * avoids any
    lifetime concern: workers are only destroyed via set_worker_dying(),
    which requires pool->lock, and set_worker_dying() clears
    last_woken_worker when the dying worker matches.
    show_cpu_pool_busy_workers() holds pool->lock while calling
    sched_show_task(), so last_woken_worker is either NULL or points to a
    live worker with a valid task. More precisely, set_worker_dying()
    clears last_woken_worker before setting WORKER_DIE, so a non-NULL
    last_woken_worker means the kthread has not yet exited and
    worker->task is still alive.

    The pool info message is printed inside pool->lock using
    printk_deferred_enter/exit, the same pattern used by the existing
    busy-worker loop, to avoid deadlocks with console drivers that queue
    work while holding locks also taken in their write paths.
    trigger_single_cpu_backtrace() is called after releasing the lock.
    Suggested-by: Petr Mladek
    Signed-off-by: Breno Leitao

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b77119d71641a..38aebf4514c03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -217,6 +217,7 @@ struct worker_pool {
 						/* L: hash of busy workers */

 	struct worker		*manager;	/* L: purely informational */
+	struct worker		*last_woken_worker; /* L: last worker woken by kick_pool() */
 	struct list_head	workers;	/* A: attached workers */
 	struct ida		worker_ida;	/* worker IDs for task name */

@@ -1295,6 +1296,9 @@ static bool kick_pool(struct worker_pool *pool)
 		}
 	}
 #endif
+	/* Track the last idle worker woken, used for stall diagnostics. */
+	pool->last_woken_worker = worker;
+
 	wake_up_process(p);
 	return true;
 }
@@ -2902,6 +2906,13 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
 	pool->nr_workers--;
 	pool->nr_idle--;

+	/*
+	 * Clear last_woken_worker if it points to this worker, so that
+	 * show_cpu_pool_busy_workers() cannot dereference a freed worker.
+	 */
+	if (pool->last_woken_worker == worker)
+		pool->last_woken_worker = NULL;
+
 	worker->flags |= WORKER_DIE;
 	list_move(&worker->entry, list);

@@ -7582,20 +7593,58 @@ module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
 MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");

 /*
- * Show workers that might prevent the processing of pending work items.
- * A busy worker that is not running on the CPU (e.g. sleeping in
- * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
- * effectively as a CPU-bound one, so dump every in-flight worker.
+ * Report that a pool has no worker in running state, which is a sign that the
+ * pool may be stuck. Print pool info. Must be called with pool->lock held;
+ * printing is deferred internally via printk_deferred_enter/exit.
+ */
+static void show_pool_no_running_worker(struct worker_pool *pool)
+{
+	lockdep_assert_held(&pool->lock);
+
+	printk_deferred_enter();
+	pr_info("pool %d: no worker in running state, cpu=%d is %s (nr_workers=%d nr_idle=%d)\n",
+		pool->id, pool->cpu,
+		idle_cpu(pool->cpu) ? "idle" : "busy",
+		pool->nr_workers, pool->nr_idle);
+	pr_info("The pool might have trouble waking an idle worker.\n");
+	/*
+	 * last_woken_worker and its task are valid here: set_worker_dying()
+	 * clears it under pool->lock before setting WORKER_DIE, so if
+	 * last_woken_worker is non-NULL the kthread has not yet exited and
+	 * worker->task is still alive.
+	 */
+	if (pool->last_woken_worker) {
+		pr_info("Backtrace of last woken worker:\n");
+		sched_show_task(pool->last_woken_worker->task);
+	} else {
+		pr_info("No last woken worker recorded\n");
+	}
+	printk_deferred_exit();
+}
+
+/*
+ * Show running workers that might prevent the processing of pending work items.
+ * If no running worker is found, the pool may be stuck waiting for an idle
+ * worker to be woken, so report the pool state and the last woken worker.
  */
 static void show_cpu_pool_busy_workers(struct worker_pool *pool)
 {
 	struct worker *worker;
 	unsigned long irq_flags;
-	int bkt;
+	bool found_running = false;
+	int cpu, bkt;

 	raw_spin_lock_irqsave(&pool->lock, irq_flags);

+	/* Snapshot cpu inside the lock to safely use it after unlock. */
+	cpu = pool->cpu;
+
 	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+		/* Skip workers that are not actively running on the CPU. */
+		if (!task_is_running(worker->task))
+			continue;
+
+		found_running = true;
 		/*
 		 * Defer printing to avoid deadlocks in console
 		 * drivers that queue work while holding locks
@@ -7609,7 +7658,23 @@ static void show_cpu_pool_busy_workers(struct worker_pool *pool)
 		printk_deferred_exit();
 	}

+	/*
+	 * If no running worker was found, the pool is likely stuck. Print pool
+	 * state and the backtrace of the last woken worker, which is the prime
+	 * suspect for the stall.
+	 */
+	if (!found_running)
+		show_pool_no_running_worker(pool);
+
 	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
+
+	/*
+	 * Trigger a backtrace on the stalled CPU to capture what it is
+	 * currently executing. Called after releasing the lock to avoid
+	 * any potential issues with NMI delivery.
+	 */
+	if (!found_running)
+		trigger_single_cpu_backtrace(cpu);
 }

 static void show_cpu_pools_busy_workers(void)