public inbox for linux-trace-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Aaron Tomlin <atomlin@atomlin.com>
To: axboe@kernel.dk, rostedt@goodmis.org, mhiramat@kernel.org,
	mathieu.desnoyers@efficios.com
Cc: johannes.thumshirn@wdc.com, kch@nvidia.com, bvanassche@acm.org,
	dlemoal@kernel.org, ritesh.list@gmail.com, loberman@redhat.com,
	neelx@suse.com, sean@ashe.io, mproche@gmail.com,
	chjohnst@gmail.com, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Subject: [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs
Date: Thu, 19 Mar 2026 18:19:56 -0400	[thread overview]
Message-ID: <20260319221956.332770-3-atomlin@atomlin.com> (raw)
In-Reply-To: <20260319221956.332770-1-atomlin@atomlin.com>

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices are starved of available
tags.

Introduce two new debugfs attributes for each block hardware queue:
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag

These files expose atomic counters that increment each time a submitting
context is forced into an uninterruptible sleep via io_schedule() due to
the complete exhaustion of physical driver tags or software scheduler
tags, respectively.

To avoid any overhead in kernels built without CONFIG_BLK_DEBUG_FS, the
atomic_t counters and the helper that increments them are compiled in
only when that option is enabled. When it is disabled,
blk_mq_debugfs_inc_wait_tags() is an empty inline stub, so the call in
blk_mq_get_tag() compiles down to a no-op.

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 block/blk-mq-debugfs.c | 56 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h |  7 ++++++
 block/blk-mq-tag.c     |  4 +++
 include/linux/blk-mq.h | 10 ++++++++
 4 files changed, 77 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 28167c9baa55..078561d7da38 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+/**
+ * hctx_wait_on_hw_tag_show - display hardware tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of physical hardware driver tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag));
+	return 0;
+}
+
+/**
+ * hctx_wait_on_sched_tag_show - display scheduler tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of software scheduler tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_sched_tag));
+	return 0;
+}
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
@@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{"type", 0400, hctx_type_show},
+	{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
+	{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
 	{},
 };
 
@@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 	debugfs_remove_recursive(hctx->sched_debugfs_dir);
 	hctx->sched_debugfs_dir = NULL;
 }
+
+/**
+ * blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
+ * @hctx: hardware context associated with the tag allocation
+ * @is_sched: true for scheduler tag exhaustion, false for driver tags
+ *
+ * Evaluates the exhausted tag pool and increments the appropriate debugfs
+ * starvation counter. This is invoked immediately before the submitting
+ * context is forced into an uninterruptible sleep via io_schedule().
+ */
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched)
+{
+	if (is_sched)
+		atomic_inc(&hctx->wait_on_sched_tag);
+	else
+		atomic_inc(&hctx->wait_on_hw_tag);
+}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 49bb1aaa83dc..2cda555d5730 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
@@ -77,6 +79,11 @@ static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
 
+static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+						bool is_sched)
+{
+}
+
 #endif
 
 #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 66138dd043d4..3cc6a97a87a0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -17,6 +17,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
 
 /*
  * Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		trace_block_rq_tag_wait(data->q, data->hctx,
 					data->rq_flags & RQF_SCHED_TAGS);
 
+		blk_mq_debugfs_inc_wait_tags(data->hctx,
+					     data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..f3d8ea93b23f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -453,6 +453,16 @@ struct blk_mq_hw_ctx {
 	struct dentry		*debugfs_dir;
 	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
+	/**
+	 * @wait_on_hw_tag: Cumulative counter incremented each time a submitting
+	 * context is forced to block due to physical hardware driver tag exhaustion.
+	 */
+	atomic_t		wait_on_hw_tag;
+	/**
+	 * @wait_on_sched_tag: Cumulative counter incremented each time a submitting
+	 * context is forced to block due to software scheduler tag exhaustion.
+	 */
+	atomic_t		wait_on_sched_tag;
 #endif
 
 	/**
-- 
2.51.0


  parent reply	other threads:[~2026-03-19 22:20 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-19 22:19 [PATCH v3 0/2] blk-mq: introduce tag starvation observability Aaron Tomlin
2026-03-19 22:19 ` [PATCH v3 1/2] blk-mq: add tracepoint block_rq_tag_wait Aaron Tomlin
2026-03-19 22:19 ` Aaron Tomlin [this message]
2026-03-20 15:08   ` [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs Laurence Oberman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260319221956.332770-3-atomlin@atomlin.com \
    --to=atomlin@atomlin.com \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=chjohnst@gmail.com \
    --cc=dlemoal@kernel.org \
    --cc=johannes.thumshirn@wdc.com \
    --cc=kch@nvidia.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=loberman@redhat.com \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=mproche@gmail.com \
    --cc=neelx@suse.com \
    --cc=ritesh.list@gmail.com \
    --cc=rostedt@goodmis.org \
    --cc=sean@ashe.io \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox