* [PATCH v6 12/14] block/Kyber: Make the lock context annotations compatible with Clang
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
While sparse ignores the __acquires() and __releases() arguments, Clang
verifies these. Make the arguments of __acquires() and __releases()
acceptable for Clang.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/kyber-iosched.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b84163d1f851..971818bcdc9d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -882,6 +882,9 @@ static const struct elv_fs_entry kyber_sched_attrs[] = {
};
#undef KYBER_LAT_ATTR
+#define HCTX_FROM_SEQ_FILE(m) ((struct blk_mq_hw_ctx *)(m)->private)
+#define KYBER_HCTX_DATA(hctx) ((struct kyber_hctx_data *)(hctx)->sched_data)
+
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
@@ -894,7 +897,7 @@ static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
} \
\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
- __acquires(&khd->lock) \
+ __acquires(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
@@ -913,7 +916,7 @@ static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
} \
\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
- __releases(&khd->lock) \
+ __releases(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
^ permalink raw reply related
* [PATCH v6 11/14] block/blk-mq-debugfs: Improve lock context annotations
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Make the existing lock context annotations compatible with Clang. Add
the lock context annotations that are missing.
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-mq-debugfs.c | 24 ++++++++++++++++++------
block/blk.h | 4 ++++
2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..6754d8f9449c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -19,8 +19,10 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
return 0;
}
+#define TO_REQUEST_QUEUE(m) ((struct request_queue *)m->private)
+
static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
- __acquires(&q->requeue_lock)
+ __acquires(&TO_REQUEST_QUEUE(m)->requeue_lock)
{
struct request_queue *q = m->private;
@@ -36,13 +38,15 @@ static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
}
static void queue_requeue_list_stop(struct seq_file *m, void *v)
- __releases(&q->requeue_lock)
+ __releases(&TO_REQUEST_QUEUE(m)->requeue_lock)
{
struct request_queue *q = m->private;
spin_unlock_irq(&q->requeue_lock);
}
+#undef TO_REQUEST_QUEUE
+
static const struct seq_operations queue_requeue_list_seq_ops = {
.start = queue_requeue_list_start,
.next = queue_requeue_list_next,
@@ -297,8 +301,10 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
}
EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
+#define TO_HCTX(m) ((struct blk_mq_hw_ctx *)m->private)
+
static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&hctx->lock)
+ __acquires(&TO_HCTX(m)->lock)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -314,13 +320,15 @@ static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
}
static void hctx_dispatch_stop(struct seq_file *m, void *v)
- __releases(&hctx->lock)
+ __releases(&TO_HCTX(m)->lock)
{
struct blk_mq_hw_ctx *hctx = m->private;
spin_unlock(&hctx->lock);
}
+#undef TO_HCTX
+
static const struct seq_operations hctx_dispatch_seq_ops = {
.start = hctx_dispatch_start,
.next = hctx_dispatch_next,
@@ -484,9 +492,11 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
return 0;
}
+#define TO_CTX(m) ((struct blk_mq_ctx *)m->private)
+
#define CTX_RQ_SEQ_OPS(name, type) \
static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
- __acquires(&ctx->lock) \
+ __acquires(&TO_CTX(m)->lock) \
{ \
struct blk_mq_ctx *ctx = m->private; \
\
@@ -503,7 +513,7 @@ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \
} \
\
static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \
- __releases(&ctx->lock) \
+ __releases(&TO_CTX(m)->lock) \
{ \
struct blk_mq_ctx *ctx = m->private; \
\
@@ -521,6 +531,8 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
+#undef TO_CTX
+
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
diff --git a/block/blk.h b/block/blk.h
index bf1a80493ff1..1a2d9101bba0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -756,16 +756,19 @@ static inline void blk_unfreeze_release_lock(struct request_queue *q)
* reclaim from triggering block I/O.
*/
static inline void blk_debugfs_lock_nomemsave(struct request_queue *q)
+ __acquires(&q->debugfs_mutex)
{
mutex_lock(&q->debugfs_mutex);
}
static inline void blk_debugfs_unlock_nomemrestore(struct request_queue *q)
+ __releases(&q->debugfs_mutex)
{
mutex_unlock(&q->debugfs_mutex);
}
static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q)
+ __acquires(&q->debugfs_mutex)
{
unsigned int memflags = memalloc_noio_save();
@@ -775,6 +778,7 @@ static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q
static inline void blk_debugfs_unlock(struct request_queue *q,
unsigned int memflags)
+ __releases(&q->debugfs_mutex)
{
blk_debugfs_unlock_nomemrestore(q);
memalloc_noio_restore(memflags);
^ permalink raw reply related
* [PATCH v6 09/14] block/blk-iocost: Split ioc_rqos_throttle()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Prepare for inlining iocg_lock() and iocg_unlock() by moving the code
between these two calls into a new function. No functionality has been
changed.
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-iocost.c | 163 ++++++++++++++++++++++++++-------------------
1 file changed, 94 insertions(+), 69 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 302388e99588..8f1468444e97 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2614,6 +2614,88 @@ static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
return cost;
}
+enum over_budget_action {
+ action_retry,
+ action_commit,
+ action_wait,
+ action_return,
+};
+
+static enum over_budget_action
+iocg_handle_over_budget(struct rq_qos *rqos, struct ioc_gq *iocg,
+ struct bio *bio, struct ioc_now *now,
+ struct iocg_wait *wait, bool use_debt, bool ioc_locked,
+ u64 abs_cost, u64 cost)
+{
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /*
+ * @iocg must stay activated for debt and waitq handling. Deactivation
+ * is synchronized against both ioc->lock and waitq.lock and we won't
+ * get deactivated as long as we're waiting or have debt, so we're good
+ * if we're activated here. In the unlikely cases that we aren't, just
+ * issue the IO.
+ */
+ if (unlikely(list_empty(&iocg->active_list)))
+ return action_commit;
+
+ /*
+ * We're over budget. If @bio has to be issued regardless, remember
+ * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
+ * off the debt before waking more IOs.
+ *
+ * This way, the debt is continuously paid off each period with the
+ * actual budget available to the cgroup. If we just wound vtime, we
+ * would incorrectly use the current hw_inuse for the entire amount
+ * which, for example, can lead to the cgroup staying blocked for a
+ * long time even with substantially raised hw_inuse.
+ *
+ * An iocg with vdebt should stay online so that the timer can keep
+ * deducting its vdebt and [de]activate use_delay mechanism
+ * accordingly. We don't want to race against the timer trying to
+ * clear them and leave @iocg inactive w/ dangling use_delay heavily
+ * penalizing the cgroup and its descendants.
+ */
+ if (use_debt) {
+ iocg_incur_debt(iocg, abs_cost, now);
+ if (iocg_kick_delay(iocg, now))
+ blkcg_schedule_throttle(rqos->disk,
+ (bio->bi_opf & REQ_SWAP) ==
+ REQ_SWAP);
+ return action_return;
+ }
+
+ /* guarantee that iocgs w/ waiters have maximum inuse */
+ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
+ if (!ioc_locked)
+ return action_retry;
+ lockdep_assert_held(&iocg->ioc->lock);
+ propagate_weights(iocg, iocg->active, iocg->active, true, now);
+ }
+
+ /*
+ * Append self to the waitq and schedule the wakeup timer if we're
+ * the first waiter. The timer duration is calculated based on the
+ * current vrate. vtime and hweight changes can make it too short
+ * or too long. Each wait entry records the absolute cost it's
+ * waiting for to allow re-evaluation using a custom wait entry.
+ *
+ * If too short, the timer simply reschedules itself. If too long,
+ * the period timer will notice and trigger wakeups.
+ *
+ * All waiters are on iocg->waitq and the wait states are
+ * synchronized using waitq.lock.
+ */
+ init_wait_func(&wait->wait, iocg_wake_fn);
+ wait->bio = bio;
+ wait->abs_cost = abs_cost;
+ wait->committed = false; /* will be set true by waker */
+
+ __add_wait_queue_entry_tail(&iocg->waitq, &wait->wait);
+ iocg_kick_waitq(iocg, ioc_locked, now);
+ return action_wait;
+}
+
static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
@@ -2623,6 +2705,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
struct iocg_wait wait;
u64 abs_cost, cost, vtime;
bool use_debt, ioc_locked;
+ enum over_budget_action action;
unsigned long flags;
/* bypass IOs if disabled, still initializing, or for root cgroup */
@@ -2663,80 +2746,22 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
retry_lock:
iocg_lock(iocg, ioc_locked, &flags);
-
- /*
- * @iocg must stay activated for debt and waitq handling. Deactivation
- * is synchronized against both ioc->lock and waitq.lock and we won't
- * get deactivated as long as we're waiting or has debt, so we're good
- * if we're activated here. In the unlikely cases that we aren't, just
- * issue the IO.
- */
- if (unlikely(list_empty(&iocg->active_list))) {
- iocg_unlock(iocg, ioc_locked, &flags);
+ action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait, use_debt,
+ ioc_locked, abs_cost, cost);
+ iocg_unlock(iocg, ioc_locked, &flags);
+ switch (action) {
+ case action_retry:
+ ioc_locked = true;
+ goto retry_lock;
+ case action_commit:
iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
- }
-
- /*
- * We're over budget. If @bio has to be issued regardless, remember
- * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
- * off the debt before waking more IOs.
- *
- * This way, the debt is continuously paid off each period with the
- * actual budget available to the cgroup. If we just wound vtime, we
- * would incorrectly use the current hw_inuse for the entire amount
- * which, for example, can lead to the cgroup staying blocked for a
- * long time even with substantially raised hw_inuse.
- *
- * An iocg with vdebt should stay online so that the timer can keep
- * deducting its vdebt and [de]activate use_delay mechanism
- * accordingly. We don't want to race against the timer trying to
- * clear them and leave @iocg inactive w/ dangling use_delay heavily
- * penalizing the cgroup and its descendants.
- */
- if (use_debt) {
- iocg_incur_debt(iocg, abs_cost, &now);
- if (iocg_kick_delay(iocg, &now))
- blkcg_schedule_throttle(rqos->disk,
- (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
- iocg_unlock(iocg, ioc_locked, &flags);
+ case action_return:
return;
+ case action_wait:
+ break;
}
- /* guarantee that iocgs w/ waiters have maximum inuse */
- if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
- if (!ioc_locked) {
- iocg_unlock(iocg, false, &flags);
- ioc_locked = true;
- goto retry_lock;
- }
- propagate_weights(iocg, iocg->active, iocg->active, true,
- &now);
- }
-
- /*
- * Append self to the waitq and schedule the wakeup timer if we're
- * the first waiter. The timer duration is calculated based on the
- * current vrate. vtime and hweight changes can make it too short
- * or too long. Each wait entry records the absolute cost it's
- * waiting for to allow re-evaluation using a custom wait entry.
- *
- * If too short, the timer simply reschedules itself. If too long,
- * the period timer will notice and trigger wakeups.
- *
- * All waiters are on iocg->waitq and the wait states are
- * synchronized using waitq.lock.
- */
- init_wait_func(&wait.wait, iocg_wake_fn);
- wait.bio = bio;
- wait.abs_cost = abs_cost;
- wait.committed = false; /* will be set true by waker */
-
- __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
- iocg_kick_waitq(iocg, ioc_locked, &now);
-
- iocg_unlock(iocg, ioc_locked, &flags);
-
while (true) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (wait.committed)
^ permalink raw reply related
* [PATCH v6 04/14] block/cgroup: Split blkg_conf_exit()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik, Yu Kuai,
Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Split blkg_conf_exit() into blkg_conf_unprep() and blkg_conf_close_bdev()
because blkg_conf_exit() is not compatible with the Clang thread-safety
annotations. Remove blkg_conf_exit(). Rename blkg_conf_exit_frozen() into
blkg_conf_close_bdev_frozen(). Add thread-safety annotations to the new
functions.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/bfq-cgroup.c | 9 ++++--
block/blk-cgroup.c | 57 ++++++++++++++++++------------------
block/blk-cgroup.h | 6 ++--
block/blk-iocost.c | 67 +++++++++++++++++++++----------------------
block/blk-iolatency.c | 19 ++++++------
block/blk-throttle.c | 34 +++++++++++++---------
6 files changed, 101 insertions(+), 91 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index df7b5a646e96..0bd0332b3d78 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -1054,11 +1054,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out;
+ return ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
if (ret)
- goto out;
+ goto close_bdev;
if (sscanf(ctx.body, "%llu", &v) == 1) {
/* require "default" on dfl */
@@ -1079,8 +1079,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
ret = 0;
}
+
out:
- blkg_conf_exit(&ctx);
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a8d95d51b866..38d7bcfcbbe8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -755,7 +755,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
*
* Initialize @ctx which can be used to parse blkg config input string @input.
* Once initialized, @ctx can be used with blkg_conf_open_bdev() and
- * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
+ * blkg_conf_prep().
*/
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
@@ -817,8 +817,8 @@ EXPORT_SYMBOL_GPL(blkg_conf_open_bdev);
* ensures the correct locking order between freeze queue and q->rq_qos_mutex.
*
* This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to blkg_conf_exit_frozen
- * for restoring the memalloc scope.
+ * memflags which must be saved and later passed to
+ * blkg_conf_close_bdev_frozen() for restoring the memalloc scope.
*/
unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
{
@@ -858,7 +858,7 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
*
* blkg_conf_open_bdev() must be called on @ctx beforehand. On success, this
* function returns with queue lock held and must be followed by
- * blkg_conf_exit().
+ * blkg_conf_close_bdev().
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx)
@@ -968,42 +968,41 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
- * blkg_conf_exit - clean up per-blkg config update
+ * blkg_conf_unprep - counterpart of blkg_conf_prep()
* @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Clean up after per-blkg config update. This function must be called on all
- * blkg_conf_ctx's initialized with blkg_conf_init().
*/
-void blkg_conf_exit(struct blkg_conf_ctx *ctx)
- __releases(&ctx->bdev->bd_queue->queue_lock)
- __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
{
- if (ctx->blkg) {
- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
- ctx->blkg = NULL;
- }
+ WARN_ON_ONCE(!ctx->blkg);
+ spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+ ctx->blkg = NULL;
+}
+EXPORT_SYMBOL_GPL(blkg_conf_unprep);
- if (ctx->bdev) {
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
- blkdev_put_no_open(ctx->bdev);
- ctx->body = NULL;
- ctx->bdev = NULL;
- }
+/**
+ * blkg_conf_close_bdev - counterpart of blkg_conf_open_bdev()
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
+ */
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
+{
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+ blkdev_put_no_open(ctx->bdev);
+ ctx->body = NULL;
+ ctx->bdev = NULL;
}
-EXPORT_SYMBOL_GPL(blkg_conf_exit);
+EXPORT_SYMBOL_GPL(blkg_conf_close_bdev);
/*
- * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used
+ * Similar to blkg_close_bdev, but also unfreezes the queue. Should be used
* when blkg_conf_open_bdev_frozen is used to open the bdev.
*/
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
+void blkg_conf_close_bdev_frozen(struct blkg_conf_ctx *ctx,
+ unsigned long memflags)
{
- if (ctx->bdev) {
- struct request_queue *q = ctx->bdev->bd_queue;
+ struct request_queue *q = ctx->bdev->bd_queue;
- blkg_conf_exit(ctx);
- blk_mq_unfreeze_queue(q, memflags);
- }
+ blkg_conf_close_bdev(ctx);
+ blk_mq_unfreeze_queue(q, memflags);
}
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cce3294634d..ce90f5b60d52 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -222,8 +222,10 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
-void blkg_conf_exit(struct blkg_conf_ctx *ctx);
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx);
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx);
+void blkg_conf_close_bdev_frozen(struct blkg_conf_ctx *ctx,
+ unsigned long memflags);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index b34f820dedcc..3e4e28ecc21f 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3142,21 +3142,23 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto err;
+ return ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
if (ret)
- goto err;
+ goto close_bdev;
iocg = blkg_to_iocg(ctx.blkg);
+ ret = -EINVAL;
+
if (!strncmp(ctx.body, "default", 7)) {
v = 0;
} else {
if (!sscanf(ctx.body, "%u", &v))
- goto einval;
+ goto unprep;
if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
- goto einval;
+ goto unprep;
}
spin_lock(&iocg->ioc->lock);
@@ -3165,14 +3167,15 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
weight_updated(iocg, &now);
spin_unlock(&iocg->ioc->lock);
- blkg_conf_exit(&ctx);
- return nbytes;
+ ret = 0;
-einval:
- ret = -EINVAL;
-err:
- blkg_conf_exit(&ctx);
- return ret;
+unprep:
+ blkg_conf_unprep(&ctx);
+
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
+
+ return ret ?: nbytes;
}
static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@ -3241,10 +3244,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blkg_conf_init(&ctx, input);
memflags = blkg_conf_open_bdev_frozen(&ctx);
- if (IS_ERR_VALUE(memflags)) {
- ret = memflags;
- goto err;
- }
+ if (IS_ERR_VALUE(memflags))
+ return memflags;
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3361,14 +3362,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blk_mq_unquiesce_queue(disk->queue);
- blkg_conf_exit_frozen(&ctx, memflags);
+ blkg_conf_close_bdev_frozen(&ctx, memflags);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
ret = -EINVAL;
err:
- blkg_conf_exit_frozen(&ctx, memflags);
+ blkg_conf_close_bdev_frozen(&ctx, memflags);
return ret;
}
@@ -3434,20 +3435,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto err;
+ return ret;
body = ctx.body;
q = bdev_get_queue(ctx.bdev);
if (!queue_is_mq(q)) {
ret = -EOPNOTSUPP;
- goto err;
+ goto close_bdev;
}
ioc = q_to_ioc(q);
if (!ioc) {
ret = blk_iocost_init(ctx.bdev->bd_disk);
if (ret)
- goto err;
+ goto close_bdev;
ioc = q_to_ioc(q);
}
@@ -3458,6 +3459,8 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
+ ret = -EINVAL;
+
while ((p = strsep(&body, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
char buf[32];
@@ -3475,20 +3478,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
else if (!strcmp(buf, "user"))
user = true;
else
- goto einval;
+ goto unlock;
continue;
case COST_MODEL:
match_strlcpy(buf, &args[0], sizeof(buf));
if (strcmp(buf, "linear"))
- goto einval;
+ goto unlock;
continue;
}
tok = match_token(p, i_lcoef_tokens, args);
if (tok == NR_I_LCOEFS)
- goto einval;
+ goto unlock;
if (match_u64(&args[0], &v))
- goto einval;
+ goto unlock;
u[tok] = v;
user = true;
}
@@ -3500,24 +3503,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc->user_cost_model = false;
}
ioc_refresh_params(ioc, true);
- spin_unlock_irq(&ioc->lock);
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q, memflags);
-
- blkg_conf_exit(&ctx);
- return nbytes;
+ ret = 0;
-einval:
+unlock:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
- ret = -EINVAL;
-err:
- blkg_conf_exit(&ctx);
- return ret;
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
+ return ret ?: nbytes;
}
static struct cftype ioc_files[] = {
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 53e8dd2dfa8a..1aaee6fb0f59 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -840,7 +840,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out;
+ return ret;
/*
* blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -850,11 +850,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
if (!iolat_rq_qos(ctx.bdev->bd_queue))
ret = blk_iolatency_init(ctx.bdev->bd_disk);
if (ret)
- goto out;
+ goto close_bdev;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
if (ret)
- goto out;
+ goto close_bdev;
iolat = blkg_to_lat(ctx.blkg);
p = ctx.body;
@@ -865,7 +865,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
char val[21]; /* 18446744073709551616 */
if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
- goto out;
+ goto unprep;
if (!strcmp(key, "target")) {
u64 v;
@@ -875,9 +875,9 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
else if (sscanf(val, "%llu", &v) == 1)
lat_val = v * NSEC_PER_USEC;
else
- goto out;
+ goto unprep;
} else {
- goto out;
+ goto unprep;
}
}
@@ -889,8 +889,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
if (oldval != iolat->min_lat_nsec)
iolatency_clear_scaling(blkg);
ret = 0;
-out:
- blkg_conf_exit(&ctx);
+
+unprep:
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 88986dde1e18..47052ba21d1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1353,21 +1353,21 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out_finish;
+ return ret;
if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
- goto out_finish;
+ goto close_bdev;
}
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- goto out_finish;
+ goto close_bdev;
ret = -EINVAL;
if (sscanf(ctx.body, "%llu", &v) != 1)
- goto out_finish;
+ goto unprep;
if (!v)
v = U64_MAX;
@@ -1381,8 +1381,12 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
tg_conf_updated(tg, false);
ret = 0;
-out_finish:
- blkg_conf_exit(&ctx);
+
+unprep:
+ blkg_conf_unprep(&ctx);
+
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
@@ -1537,17 +1541,17 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out_finish;
+ return ret;
if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
- goto out_finish;
+ goto close_bdev;
}
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- goto out_finish;
+ goto close_bdev;
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
@@ -1573,11 +1577,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
p = tok;
strsep(&p, "=");
if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
- goto out_finish;
+ goto unprep;
ret = -ERANGE;
if (!val)
- goto out_finish;
+ goto unprep;
ret = -EINVAL;
if (!strcmp(tok, "rbps"))
@@ -1589,7 +1593,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
else
- goto out_finish;
+ goto unprep;
}
tg->bps[READ] = v[0];
@@ -1599,8 +1603,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg_conf_updated(tg, false);
ret = 0;
-out_finish:
- blkg_conf_exit(&ctx);
+unprep:
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
^ permalink raw reply related
* [PATCH v6 10/14] block/blk-iocost: Inline iocg_lock() and iocg_unlock()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Both iocg_lock() and iocg_unlock() use conditional locking. Fold these
functions into their callers such that unlocking becomes unconditional.
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-iocost.c | 49 +++++++++++++++++++---------------------------
1 file changed, 20 insertions(+), 29 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 8f1468444e97..e58274bc83e0 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -727,26 +727,6 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
put_cpu_ptr(gcs);
}
-static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
-{
- if (lock_ioc) {
- spin_lock_irqsave(&iocg->ioc->lock, *flags);
- spin_lock(&iocg->waitq.lock);
- } else {
- spin_lock_irqsave(&iocg->waitq.lock, *flags);
- }
-}
-
-static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
-{
- if (unlock_ioc) {
- spin_unlock(&iocg->waitq.lock);
- spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
- } else {
- spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
- }
-}
-
#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>
@@ -1585,13 +1565,17 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
bool pay_debt = READ_ONCE(iocg->abs_vdebt);
struct ioc_now now;
- unsigned long flags;
ioc_now(iocg->ioc, &now);
- iocg_lock(iocg, pay_debt, &flags);
- iocg_kick_waitq(iocg, pay_debt, &now);
- iocg_unlock(iocg, pay_debt, &flags);
+ if (pay_debt) {
+ guard(spinlock_irqsave)(&iocg->ioc->lock);
+ guard(spinlock)(&iocg->waitq.lock);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ } else {
+ guard(spinlock_irqsave)(&iocg->waitq.lock);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ }
return HRTIMER_NORESTART;
}
@@ -2706,7 +2690,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
u64 abs_cost, cost, vtime;
bool use_debt, ioc_locked;
enum over_budget_action action;
- unsigned long flags;
/* bypass IOs if disabled, still initializing, or for root cgroup */
if (!ioc->enabled || !iocg || !iocg->level)
@@ -2745,10 +2728,18 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
retry_lock:
- iocg_lock(iocg, ioc_locked, &flags);
- action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait, use_debt,
- ioc_locked, abs_cost, cost);
- iocg_unlock(iocg, ioc_locked, &flags);
+ if (ioc_locked) {
+ guard(spinlock_irqsave)(&iocg->ioc->lock);
+ guard(spinlock)(&iocg->waitq.lock);
+ action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+ use_debt, ioc_locked, abs_cost,
+ cost);
+ } else {
+ guard(spinlock_irqsave)(&iocg->waitq.lock);
+ action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+ use_debt, ioc_locked, abs_cost,
+ cost);
+ }
switch (action) {
case action_retry:
ioc_locked = true;
^ permalink raw reply related
* [PATCH v6 07/14] block/cgroup: Inline blkg_conf_{open,close}_bdev_frozen()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
The blkg_conf_open_bdev_frozen() calling convention is not compatible
with lock context annotations. Fold both blkg_conf_open_bdev_frozen()
and blkg_conf_close_bdev_frozen() into their only caller. This patch
prepares for enabling lock context analysis.
The type of 'memflags' has been changed from unsigned long into unsigned
int to match the type of current->flags. See also <linux/sched.h>.
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-cgroup.c | 46 ----------------------------------------------
block/blk-cgroup.h | 4 ----
block/blk-iocost.c | 25 +++++++++++++++++++------
3 files changed, 19 insertions(+), 56 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 86513c54c217..de0f753b8fe5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -812,39 +812,6 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_open_bdev);
-/*
- * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * ensures the correct locking order between freeze queue and q->rq_qos_mutex.
- *
- * This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to
- * blkg_conf_close_bdev_frozen() for restoring the memalloc scope.
- */
-unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
-{
- int ret;
- unsigned long memflags;
-
- if (ctx->bdev)
- return -EINVAL;
-
- ret = blkg_conf_open_bdev(ctx);
- if (ret < 0)
- return ret;
- /*
- * At this point, we haven’t started protecting anything related to QoS,
- * so we release q->rq_qos_mutex here, which was first acquired in blkg_
- * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
- * the queue to maintain the correct locking order.
- */
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
- mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- return memflags;
-}
-
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -991,19 +958,6 @@ void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_close_bdev);
-/*
- * Similar to blkg_close_bdev, but also unfreezes the queue. Should be used
- * when blkg_conf_open_bdev_frozen is used to open the bdev.
- */
-void blkg_conf_close_bdev_frozen(struct blkg_conf_ctx *ctx,
- unsigned long memflags)
-{
- struct request_queue *q = ctx->bdev->bd_queue;
-
- blkg_conf_close_bdev(ctx);
- blk_mq_unfreeze_queue(q, memflags);
-}
-
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
int i;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index f0a3af520c55..f25fecb87c43 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -220,7 +220,6 @@ struct blkg_conf_ctx {
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
__cond_acquires(0, &ctx->bdev->bd_queue->rq_qos_mutex);
-unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx)
__cond_acquires(0, &ctx->bdev->bd_disk->queue->queue_lock);
@@ -228,9 +227,6 @@ void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
__releases(ctx->bdev->bd_disk->queue->queue_lock);
void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_queue->rq_qos_mutex);
-void blkg_conf_close_bdev_frozen(struct blkg_conf_ctx *ctx,
- unsigned long memflags)
- __releases(&ctx->bdev->bd_queue->rq_qos_mutex);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 050bfbc6d806..302388e99588 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3233,19 +3233,30 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct blkg_conf_ctx ctx;
+ struct request_queue *q;
struct gendisk *disk;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
- unsigned long memflags;
- int ret = 0;
+ unsigned int memflags;
+ int ret;
blkg_conf_init(&ctx, input);
- memflags = blkg_conf_open_bdev_frozen(&ctx);
- if (IS_ERR_VALUE(memflags))
- return memflags;
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ return ret;
+ /*
+ * At this point, we haven’t started protecting anything related to QoS,
+ * so we release q->rq_qos_mutex here, which was first acquired in blkg_
+ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
+ * the queue to maintain the correct locking order.
+ */
+ mutex_unlock(&ctx.bdev->bd_queue->rq_qos_mutex);
+
+ memflags = blk_mq_freeze_queue(ctx.bdev->bd_queue);
+ mutex_lock(&ctx.bdev->bd_queue->rq_qos_mutex);
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3363,7 +3374,9 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blk_mq_unquiesce_queue(disk->queue);
close_bdev:
- blkg_conf_close_bdev_frozen(&ctx, memflags);
+ q = ctx.bdev->bd_queue;
+ blkg_conf_close_bdev(&ctx);
+ blk_mq_unfreeze_queue(q, memflags);
return ret ?: nbytes;
einval:
^ permalink raw reply related
* [PATCH v6 08/14] block/crypto: Annotate the crypto functions
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Eric Biggers, Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Add the lock context annotations required for Clang's thread-safety
analysis.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-crypto-profile.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 4ac74443687a..cf447ba4a66e 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -43,6 +43,7 @@ struct blk_crypto_keyslot {
};
static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+ __acquires(&profile->lock)
{
/*
* Calling into the driver requires profile->lock held and the device
@@ -55,6 +56,7 @@ static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
}
static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+ __releases(&profile->lock)
{
up_write(&profile->lock);
if (profile->dev)
^ permalink raw reply related
* [PATCH v6 06/14] block/blk-iocost: Combine two error paths in ioc_qos_write()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Reduce code duplication by combining two error paths. No functionality
has been changed.
Suggested-by: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-iocost.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 3e4e28ecc21f..050bfbc6d806 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3239,7 +3239,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
bool enable, user;
char *body, *p;
unsigned long memflags;
- int ret;
+ int ret = 0;
blkg_conf_init(&ctx, input);
@@ -3251,14 +3251,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
disk = ctx.bdev->bd_disk;
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
- goto err;
+ goto close_bdev;
}
ioc = q_to_ioc(disk->queue);
if (!ioc) {
ret = blk_iocost_init(disk);
if (ret)
- goto err;
+ goto close_bdev;
ioc = q_to_ioc(disk->queue);
}
@@ -3362,15 +3362,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blk_mq_unquiesce_queue(disk->queue);
+close_bdev:
blkg_conf_close_bdev_frozen(&ctx, memflags);
- return nbytes;
+ return ret ?: nbytes;
+
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
ret = -EINVAL;
-err:
- blkg_conf_close_bdev_frozen(&ctx, memflags);
- return ret;
+ goto close_bdev;
}
static u64 ioc_cost_model_prfill(struct seq_file *sf,
^ permalink raw reply related
* [PATCH v6 05/14] block/cgroup: Improve lock context annotations
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Add lock context annotations where these are missing. Move the
blkg_conf_prep() annotation into block/blk-cgroup.h to make it visible
to all blkg_conf_prep() callers.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/blk-cgroup.c | 1 -
block/blk-cgroup.h | 15 ++++++++++-----
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 38d7bcfcbbe8..86513c54c217 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -862,7 +862,6 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx)
- __acquires(&bdev->bd_queue->queue_lock)
{
struct gendisk *disk;
struct request_queue *q;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ce90f5b60d52..f0a3af520c55 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -218,14 +218,19 @@ struct blkg_conf_ctx {
};
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
-int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
+ __cond_acquires(0, &ctx->bdev->bd_queue->rq_qos_mutex);
unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- struct blkg_conf_ctx *ctx);
-void blkg_conf_unprep(struct blkg_conf_ctx *ctx);
-void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx);
+ struct blkg_conf_ctx *ctx)
+ __cond_acquires(0, &ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
+ __releases(ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex);
void blkg_conf_close_bdev_frozen(struct blkg_conf_ctx *ctx,
- unsigned long memflags);
+ unsigned long memflags)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
^ permalink raw reply related
* [PATCH v6 03/14] block/cgroup: Split blkg_conf_prep()
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Tejun Heo, Josef Bacik, Yu Kuai
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Move the blkg_conf_open_bdev() call out of blkg_conf_prep() to make it
possible to add lock context annotations to blkg_conf_prep(). Change an
if-statement in blkg_conf_open_bdev() into a WARN_ON_ONCE() call. Export
blkg_conf_open_bdev() because it is called by the BFQ I/O scheduler and
the BFQ I/O scheduler may be built as a kernel module.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/bfq-cgroup.c | 4 ++++
block/blk-cgroup.c | 18 ++++++++----------
block/blk-iocost.c | 4 ++++
3 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 37ab70930c8d..df7b5a646e96 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -1052,6 +1052,10 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out;
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
if (ret)
goto out;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 554c87bb4a86..a8d95d51b866 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -771,10 +771,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_init);
* @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
* set to point past the device node prefix.
*
- * This function may be called multiple times on @ctx and the extra calls become
- * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
- * explicitly if bdev access is needed without resolving the blkcg / policy part
- * of @ctx->input. Returns -errno on error.
+ * Returns: -errno on error.
*/
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
@@ -783,8 +780,8 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
struct block_device *bdev;
int key_len;
- if (ctx->bdev)
- return 0;
+ if (WARN_ON_ONCE(ctx->bdev))
+ return -EINVAL;
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return -EINVAL;
@@ -813,6 +810,8 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
+EXPORT_SYMBOL_GPL(blkg_conf_open_bdev);
+
/*
* Similar to blkg_conf_open_bdev, but additionally freezes the queue,
* ensures the correct locking order between freeze queue and q->rq_qos_mutex.
@@ -857,7 +856,7 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
* following MAJ:MIN, @ctx->bdev points to the target block device and
* @ctx->blkg to the blkg being configured.
*
- * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
+ * blkg_conf_open_bdev() must be called on @ctx beforehand. On success, this
* function returns with queue lock held and must be followed by
* blkg_conf_exit().
*/
@@ -870,9 +869,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkcg_gq *blkg;
int ret;
- ret = blkg_conf_open_bdev(ctx);
- if (ret)
- return ret;
+ if (WARN_ON_ONCE(!ctx->bdev))
+ return -EINVAL;
disk = ctx->bdev->bd_disk;
q = disk->queue;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 0cca88a366dc..b34f820dedcc 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3140,6 +3140,10 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto err;
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
if (ret)
goto err;
^ permalink raw reply related
* [PATCH v6 02/14] block/bdev: Annotate the blk_holder_ops callback functions
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
The four callback functions in blk_holder_ops all release the
bd_holder_lock. Annotate these functions accordingly.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
include/linux/blkdev.h | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 65efbd7fe1a3..57e84d59a642 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1746,22 +1746,26 @@ void blkdev_show(struct seq_file *seqf, off_t offset);
#endif
struct blk_holder_ops {
- void (*mark_dead)(struct block_device *bdev, bool surprise);
+ void (*mark_dead)(struct block_device *bdev, bool surprise)
+ __releases(&bdev->bd_holder_lock);
/*
* Sync the file system mounted on the block device.
*/
- void (*sync)(struct block_device *bdev);
+ void (*sync)(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock);
/*
* Freeze the file system mounted on the block device.
*/
- int (*freeze)(struct block_device *bdev);
+ int (*freeze)(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock);
/*
* Thaw the file system mounted on the block device.
*/
- int (*thaw)(struct block_device *bdev);
+ int (*thaw)(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock);
};
/*
^ permalink raw reply related
* [PATCH v6 00/14] Enable lock context analysis for the block layer core
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche
Hi Jens,
Recently the following patch series has been merged: [PATCH v5 00/36]
Compiler-Based Context- and Locking-Analysis
(https://lore.kernel.org/lkml/20251219154418.3592607-1-elver@google.com/). That
patch series drops support for verifying lock context annotations with sparse
and introduces support for verifying lock context annotations with Clang. The
support in Clang for lock context annotation and verification is better than
that in sparse. As an example, __cond_acquires() and __guarded_by() are
supported by Clang but not by sparse. Hence this patch series that enables lock
context analysis for the block layer core.
Note: although the Linux kernel documentation specifies 22 as minimal
version for Clang for context analysis support, this patch series requires
Clang 23 because it annotates function pointers. As one can see here, a patch
has been queued that fixes the kernel documentation:
https://lore.kernel.org/all/177926568868.711.3058599932884307249.tip-bot2@tip-bot2/
Please consider this patch series for the next merge window.
Thanks,
Bart.
Changes compared to v5:
- Made the description of patch "block/cgroup: Inline
blkg_conf_{open,close}_bdev_frozen()" more clear.
- Combined two error paths in ioc_qos_write().
- Folded iocg_lock() and iocg_unlock() into their callers. Removed the token
context lock "ioc_lock".
- Introduced macros in the debugfs code that capture expressions with
pointer casts.
- Added two patches: "block/blk-iocost: Combine two error paths in
ioc_qos_write()" and "block/blk-iocost: Split ioc_rqos_throttle()".
Changes compared to v4:
- Rebased and retested on top of Jens' latest for-next branch.
Changes compared to v3:
- Replaced the "block/bdev: Annotate the blk_holder_ops callback invocations"
patch with a patch that adds __releases() annotations to the function
pointers in struct blk_holder_ops.
- Dropped the blk-zoned patch since a better patch from Christoph has
been merged.
Changes compared to v2:
- Retained the block layer core patches and left out the block driver patches.
- Inlined blkg_conf_open_bdev_frozen() and blkg_conf_close_bdev_frozen().
- In blkg_conf_open_bdev(), added a return statement if the
WARN_ON_ONCE() statement triggers.
- Replaced the "block/ioctl: Add lock context annotations" patch with a
__release() annotation.
- Replaced the blk-zoned patch with a patch from Christoph.
Changes compared to v1:
- Rebased this patch series on top of Jens' for-next branch.
- Included two patches that split blkg_conf_prep() and blkg_conf_exit().
- Modified how patches are split. Split the block layer core patch into
multiple patches and moved the CONTEXT_ANALYSIS := y assignments into the
block driver patches.
- Made the new source code comments easier to comprehend.
- Introduced macros in the mq-deadline and Kyber I/O schedulers to make the
__acquires() expressions easier to read.
- Removed the changes from this series that are not block layer changes.
Bart Van Assche (14):
block: Annotate the queue limits functions
block/bdev: Annotate the blk_holder_ops callback functions
block/cgroup: Split blkg_conf_prep()
block/cgroup: Split blkg_conf_exit()
block/cgroup: Improve lock context annotations
block/blk-iocost: Combine two error paths in ioc_qos_write()
block/cgroup: Inline blkg_conf_{open,close}_bdev_frozen()
block/crypto: Annotate the crypto functions
block/blk-iocost: Split ioc_rqos_throttle()
block/blk-iocost: Inline iocg_lock() and iocg_unlock()
block/blk-mq-debugfs: Improve lock context annotations
block/Kyber: Make the lock context annotations compatible with Clang
block/mq-deadline: Make the lock context annotations compatible with
Clang
block: Enable lock context analysis
block/Makefile | 2 +
block/bfq-cgroup.c | 11 +-
block/blk-cgroup.c | 98 +++---------
block/blk-cgroup.h | 13 +-
block/blk-crypto-profile.c | 2 +
block/blk-iocost.c | 302 ++++++++++++++++++++-----------------
block/blk-iolatency.c | 19 ++-
block/blk-mq-debugfs.c | 24 ++-
block/blk-throttle.c | 34 +++--
block/blk.h | 4 +
block/kyber-iosched.c | 7 +-
block/mq-deadline.c | 12 +-
include/linux/blkdev.h | 21 ++-
13 files changed, 291 insertions(+), 258 deletions(-)
^ permalink raw reply
* [PATCH v6 01/14] block: Annotate the queue limits functions
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Let the thread-safety checker verify whether every start of a queue
limits update is followed by a call to a function that finishes a queue
limits update.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
include/linux/blkdev.h | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 17270a28c66d..65efbd7fe1a3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1092,15 +1092,17 @@ static inline unsigned int blk_boundary_sectors_left(sector_t offset,
*/
static inline struct queue_limits
queue_limits_start_update(struct request_queue *q)
+ __acquires(&q->limits_lock)
{
mutex_lock(&q->limits_lock);
return q->limits;
}
int queue_limits_commit_update_frozen(struct request_queue *q,
- struct queue_limits *lim);
+ struct queue_limits *lim) __releases(&q->limits_lock);
int queue_limits_commit_update(struct request_queue *q,
- struct queue_limits *lim);
-int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
+ struct queue_limits *lim) __releases(&q->limits_lock);
+int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
+ __must_not_hold(&q->limits_lock);
int blk_validate_limits(struct queue_limits *lim);
/**
@@ -1112,6 +1114,7 @@ int blk_validate_limits(struct queue_limits *lim);
* starting update.
*/
static inline void queue_limits_cancel_update(struct request_queue *q)
+ __releases(&q->limits_lock)
{
mutex_unlock(&q->limits_lock);
}
^ permalink raw reply related
* Re: [PATCH] block/partitions/acorn: use min in {riscix,linux}_partition
From: Kees Cook @ 2026-06-02 16:49 UTC (permalink / raw)
To: Thorsten Blum; +Cc: Jens Axboe, Josh Law, linux-block, linux-kernel
In-Reply-To: <20260602160757.973736-3-thorsten.blum@linux.dev>
On Tue, Jun 02, 2026 at 06:07:57PM +0200, Thorsten Blum wrote:
> Use min() to replace the open-coded implementations and to simplify
> riscix_partition() and linux_partition().
>
> Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Seems good.
Reviewed-by: Kees Cook <kees@kernel.org>
--
Kees Cook
^ permalink raw reply
* Re: [PATCH RFC 7/8] erofs: open via dedicated fs bdev helpers
From: Gao Xiang @ 2026-06-02 16:25 UTC (permalink / raw)
To: Christian Brauner
Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
Gao Xiang, linux-erofs, Christoph Hellwig, Jan Kara
In-Reply-To: <20260602-work-super-bdev_holder_global-v1-7-bb0fd82f3861@kernel.org>
On 2026/6/2 18:10, Christian Brauner wrote:
> Route opens through fs_bdev_file_open_by_path() so each external device
> is registered against the correct superblock, and convert the matching
> releases.
>
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
> ---
> fs/erofs/data.c | 6 +++++
> fs/erofs/internal.h | 10 ++++++++
> fs/erofs/super.c | 66 +++++++++++++++++++++++++++++++++++++++++++----------
> fs/erofs/zdata.c | 10 +++++---
> 4 files changed, 77 insertions(+), 15 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 44da21c9d777..5220585293df 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -69,6 +69,9 @@ int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
> {
> struct erofs_sb_info *sbi = EROFS_SB(sb);
>
> + if (erofs_is_shutdown(sb))
> + return -EIO;
> +
> buf->file = NULL;
> if (in_metabox) {
> if (unlikely(!sbi->metabox_inode))
> @@ -236,6 +239,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
> }
> up_read(&devs->rwsem);
> }
> + if (erofs_is_shutdown(sb) ||
> + (map->m_dif && READ_ONCE(map->m_dif->dead)))
> + return -EIO;
Take a quick look at the code, maybe we can just add
the SHUTDOWN status only since I don't think remove an
individual blob device is useful for the typical image
use cases, so there is no need adding `dead` for each
individual extra device.
and just bail out if erofs_is_shutdown() at the very
beginning of erofs_map_dev()?
> return 0;
> }
>
...
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 43bb5a6a9924..89ae91935364 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -1697,11 +1697,15 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f,
> continue;
> }
>
> - /* no device id here, thus it will always succeed */
> mdev = (struct erofs_map_dev) {
> .m_pa = round_down(pcl->pos, sb->s_blocksize),
> };
> - (void)erofs_map_dev(sb, &mdev);
> + if (erofs_map_dev(sb, &mdev)) {
> + /* the backing device is gone; fail the batch */
> + q[JQ_SUBMIT]->eio = true;
> + qtail[JQ_SUBMIT] = &pcl->next;
> + continue;
> + }
It needs some injection tests anyway.
May I ask if it's an urgent 7.2 work? If not, I could
make a preparation patch for the upcoming 7.2 cycle
to handle erofs_map_dev() failure here so you don't
need to bother with this in this patchset.
I will seek more time to resolve the recent todos
yet always intercepted by other unrelated stuffs.
Thanks,
Gao Xaing
^ permalink raw reply
* Re: [PATCH] blk-iolatency: fix child_lat lock irq state
From: Yu Kuai @ 2026-06-02 16:23 UTC (permalink / raw)
To: Yu Kuai, tj, axboe; +Cc: linux-block
In-Reply-To: <20260601080235.980914-1-yukuai@fygo.io>
Hi,
在 2026/6/1 16:02, Yu Kuai 写道:
> iolatency_clear_scaling() updates child_lat.lock with hardirqs enabled.
> The bio completion path can take the same lock from hardirq context.
>
> This triggers lockdep after io.latency is configured and I/O completes.
> Full lockdep report:
>
> WARNING: inconsistent lock state
> 7.1.0-rc2-g6a04b2279273 #1 Not tainted
> --------------------------------
> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
> swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
> ffff88810c682d10 (&iolat->child_lat.lock){?.+.}-{3:3}, at: blkcg_iolatency_done_bio+0x6e7/0xb90
> {HARDIRQ-ON-W} state was registered at:
> lock_acquire+0xd4/0x290
> _raw_spin_lock+0x3a/0x70
> iolatency_set_limit+0x49b/0x590
Please ignore this patch, this trace is because I refactor locally to convert
protecting blkg from queue_lock to blkcg_mutex, and spin_lock_irq(&q->queue_lock)
is removed from blkg_conf_prep(), hence irq is no longer enabled.
I should have checked if this problem is introduced by myself. Sorry for
the noise.
> cgroup_file_write+0x1c5/0x4b0
> kernfs_fop_write_iter+0x1d7/0x280
> vfs_write+0x580/0x630
> ksys_write+0xec/0x190
> do_syscall_64+0x156/0x490
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> irq event stamp: 328476
> hardirqs last enabled at (328475): [<ffffffffa4dd93b1>] do_idle+0x261/0x400
> hardirqs last disabled at (328476): [<ffffffffa68347f3>] common_interrupt+0x13/0x90
> softirqs last enabled at (328398): [<ffffffffa4d508ac>] __irq_exit_rcu+0x8c/0x150
> softirqs last disabled at (328387): [<ffffffffa4d508ac>] __irq_exit_rcu+0x8c/0x150
>
> other info that might help us debug this:
> Possible unsafe locking scenario:
>
> CPU0
> ----
> lock(&iolat->child_lat.lock);
> <Interrupt>
> lock(&iolat->child_lat.lock);
>
> *** DEADLOCK ***
>
> 1 lock held by swapper/0/0:
> #0: ffff888103365450 (&virtscsi_vq->vq_lock){-.-.}-{3:3}, at: virtscsi_vq_done+0x9f/0x130
>
> stack backtrace:
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 7.1.0-rc2-g6a04b2279273 #1 PREEMPT 1c49bdb9e32f352d2b66a5ca23d36d656c610458
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
> Call Trace:
> <IRQ>
> dump_stack_lvl+0x54/0x70
> print_usage_bug+0x26d/0x280
> mark_lock_irq+0x3ef/0x400
> ? save_trace+0x3d/0x2f0
> ? __pfx_stack_trace_consume_entry+0x10/0x10
> mark_lock+0x117/0x190
> __lock_acquire+0x570/0x2850
> ? stack_trace_save+0xa1/0xe0
> ? __pfx_stack_trace_save+0x10/0x10
> ? filter_irq_stacks+0x27/0x80
> ? stack_depot_save_flags+0x32/0x7f0
> lock_acquire+0xd4/0x290
> ? blkcg_iolatency_done_bio+0x6e7/0xb90
> ? kvm_sched_clock_read+0x11/0x20
> ? local_clock_noinstr+0xc/0xc0
> ? local_clock+0x15/0x30
> ? lock_release+0x111/0x470
> ? blkcg_iolatency_done_bio+0x6e7/0xb90
> _raw_spin_lock_irqsave+0x4c/0x90
> ? blkcg_iolatency_done_bio+0x6e7/0xb90
> blkcg_iolatency_done_bio+0x6e7/0xb90
> ? __pfx_blkcg_iolatency_done_bio+0x10/0x10
> __rq_qos_done_bio+0x51/0x60
> bio_endio+0x135/0x320
> blk_update_request+0x1e6/0x570
> scsi_end_request+0x4b/0x410
> scsi_io_completion+0x83/0x170
> ? __pfx_virtscsi_complete_cmd+0x10/0x10
> virtscsi_vq_done+0xd7/0x130
> ? lock_acquire+0xd4/0x290
> ? __pfx_virtscsi_vq_done+0x10/0x10
> ? local_clock_noinstr+0xc/0xc0
> ? local_clock+0x15/0x30
> vring_interrupt+0x13b/0x150
> ? __pfx_vring_interrupt+0x10/0x10
> __handle_irq_event_percpu+0x145/0x4b0
> handle_irq_event+0x54/0xb0
> handle_edge_irq+0x111/0x320
> __common_interrupt+0x97/0xf0
> common_interrupt+0x7e/0x90
> </IRQ>
> <TASK>
> asm_common_interrupt+0x26/0x40
> RIP: 0010:pv_native_safe_halt+0x13/0x20
> Code: d3 a5 01 00 cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 2f 39 21 00 f3 0f 1e fa fb f4 <c3> cc cc cc cc cc 0f 1f 80 00 00 00 00 90 90 90 90 90 90 90 90 90
> RSP: 0018:ffffffffa7607e00 EFLAGS: 00000246
> RAX: 000000000005031b RBX: ffffffffa4dd93b1 RCX: ffffffffa683884b
> RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffffffa4dd93b1
> RBP: ffffffffa7607ed0 R08: ffff888117bf408b R09: 1ffff11022f7e811
> R10: dffffc0000000000 R11: ffffed1022f7e812 R12: 0000000000000000
> R13: 0000000000000000 R14: 0000000000000000 R15: ffffffffa7f6cff0
> ? do_idle+0x261/0x400
> ? ct_kernel_exit+0xcb/0x110
> ? do_idle+0x261/0x400
> default_idle+0x9/0x20
> default_idle_call+0x73/0xb0
> do_idle+0x261/0x400
> ? __pfx_do_idle+0x10/0x10
> ? local_clock_noinstr+0x30/0xc0
> ? local_clock+0x15/0x30
> cpu_startup_entry+0x36/0x40
> rest_init+0x207/0x210
> start_kernel+0x321/0x370
> x86_64_start_reservations+0x24/0x30
> x86_64_start_kernel+0x13a/0x140
> common_startup_64+0x13e/0x147
> </TASK>
>
> Fix it by using spin_lock_irqsave() in iolatency_clear_scaling().
> Use irqsave rather than spin_lock_irq() because the same helper is also
> called from pd_offline_fn paths where hardirqs can already be disabled
> by blkcg teardown/deactivation locks. spin_unlock_irq() would wrongly
> enable hardirqs in those paths.
>
> Fixes: d70675121546 ("block: introduce blk-iolatency io controller")
> Signed-off-by: Yu Kuai <yukuai@fygo.io>
> ---
> block/blk-iolatency.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
> index 53e8dd2dfa8a..9152dc86b08b 100644
> --- a/block/blk-iolatency.c
> +++ b/block/blk-iolatency.c
> @@ -811,16 +811,18 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg)
> if (blkg->parent) {
> struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
> struct child_latency_info *lat_info;
> + unsigned long flags;
> +
> if (!iolat)
> return;
>
> lat_info = &iolat->child_lat;
> - spin_lock(&lat_info->lock);
> + spin_lock_irqsave(&lat_info->lock, flags);
> atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
> lat_info->last_scale_event = 0;
> lat_info->scale_grp = NULL;
> lat_info->scale_lat = 0;
> - spin_unlock(&lat_info->lock);
> + spin_unlock_irqrestore(&lat_info->lock, flags);
> }
> }
>
--
Thansk,
Kuai
^ permalink raw reply
* Re: [PATCH RFC 0/8] fs: support freeze/thaw/mark_dead/sync with shared devices
From: Gao Xiang @ 2026-06-02 16:12 UTC (permalink / raw)
To: Christian Brauner
Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
Gao Xiang, linux-erofs, Christoph Hellwig, Jan Kara
In-Reply-To: <20260602-work-super-bdev_holder_global-v1-0-bb0fd82f3861@kernel.org>
Hi,
On 2026/6/2 18:10, Christian Brauner wrote:
> Note, this is on the border between RFC/POC and so I haven't pushed this
> through testing yet. But I don't want to waste more time on this before
> showing it.
>
> I surveyed various fs implementations because I want the ability to
> extend userspace the ability to manage what devices can be onlined in a
> centralized way without having to force every fs to care about this.
>
> I realized that erofs allows sharing block devices with multiple
> superblocks. Any freeze, thaw, removal, or sync on those devices will
> not be communicated to the superblocks using it and our current
> infrastructure is unable to deal with this.
>
> This attempts to add the ability to go from device number to all the
> superblock using that device, iterate through them one-by-one and
> perform actions on them. For most fses this is a 1:1 mapping but for
> erofs its a 1:many mapping.
>
> This is not unreasonable infastructure to support in my opinion. I
> played around with some ideas for this and I want to send out an RFC to
> gather some early input.
Yes, just a side note: On the erofs side, since we apply immutable
model to each filesystems rather than writable filesystem approaches
so inode data (in devices or files) can be shared among multiple
different filesystems without any reference count needs for example
(in the similar models: any write needs to be COWed using overlayfs
for example.), so blob devices are 1:many shared mapping by design.
One typical example is that we could convert each OCI tar layer
into an erofs blob, and use a metadata-only erofs to index these
converted erofs blobs so there is only one filesystem instead of
per-layer filesystems (it's called fsmerge in the containerd
implementation.), but each converted erofs blob can be shared
among different filesystems.
Another example is incremental diff updates, the primary device
can only contain incremental data and refer to the base image for
the remaining data; and base image can be shared too.
Thanks,
Gao Xiang
^ permalink raw reply
* Re: [PATCH] blk-iocost: use irq-safe locking in cgroup handlers
From: Yu Kuai @ 2026-06-02 16:11 UTC (permalink / raw)
To: Jens Axboe, Bart Van Assche, tj, josef; +Cc: linux-block, linux-kernel
In-Reply-To: <2f20263e-c589-491d-9be8-c2e3d56a9229@fygo.io>
Hi,
在 2026/6/2 23:56, Yu Kuai 写道:
> Hi,
>
> 在 2026/6/2 21:25, Jens Axboe 写道:
>> On 6/1/26 3:50 PM, Bart Van Assche wrote:
>>> On 5/31/26 11:13 PM, Yu Kuai wrote:
>>>> @@ -3378,14 +3378,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
>>>> if (!dname)
>>>> return 0;
>>>> - spin_lock(&ioc->lock);
>>>> + spin_lock_irq(&ioc->lock);
>>>> seq_printf(sf, "%s ctrl=%s model=linear "
>>>> "rbps=%llu rseqiops=%llu rrandiops=%llu "
>>>> "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
>>>> dname, ioc->user_cost_model ? "user" : "auto",
>>>> u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
>>>> u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
>>>> - spin_unlock(&ioc->lock);
>>>> + spin_unlock_irq(&ioc->lock);
>>>> return 0;
>>>> }
>>> This change is wrong. ioc_cost_model_prfill() only has one caller,
>>> namely blkcg_print_blkgs(). blkcg_print_blkgs() calls the above function
>>> with interrupts disabled. The spin_unlock_irq(&ioc->lock) at the end of
>>> the above function enables interrupts while q->queue_lock is held. If an
>>> interrupt happens on the same CPU core before q->queue_lock is unlocked,
>>> and that interrupt tries to lock q->queue_lock, a deadlock will occur.
>> Agree, it's broken. Which makes me suspect of the traces shown. Yu,
>> can you please shed some light on this?
> Looks like my reply is in your spam again :(
>
> The trace is from ioc_weight_write(), which do have the problem. And
> while reviewing related code, I'm wrong to think ioc_cost_model_prfill()
> have the same problem and changed it as well.
>
>> I've dropped it, thanks Bart.
> I'll send a v2, and only fix ioc_weight_write().
I just update the latest branch and try this patch, however I didn't reporduce
the problem. And turns out, blkg_conf_prep() already disable irq by
spin_lock_irq(&q->queue_lock). So there is no problem at all.
The trace I found is because there are some pending patches to convert
protecting blkcg from queue_lock to blkcg_mutex, and the
spin_lock_irq(&q->queue_lock) is removed.
Sorry for the noise, I should have checked if this problem was introduced
by myself first. And thanks Bart to catch it.
>
--
Thansk,
Kuai
^ permalink raw reply
* [PATCH] block/partitions/acorn: use min in {riscix,linux}_partition
From: Thorsten Blum @ 2026-06-02 16:07 UTC (permalink / raw)
To: Jens Axboe, Kees Cook, Josh Law; +Cc: Thorsten Blum, linux-block, linux-kernel
Use min() to replace the open-coded implementations and to simplify
riscix_partition() and linux_partition().
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
---
block/partitions/acorn.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index 9f7389f174d0..067d6a27a3bd 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -9,6 +9,7 @@
*/
#include <linux/buffer_head.h>
#include <linux/adfs_fs.h>
+#include <linux/minmax.h>
#include "check.h"
@@ -80,7 +81,7 @@ static int riscix_partition(struct parsed_partitions *state,
if (rr->magic == RISCIX_MAGIC) {
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+ unsigned long size = min(nr_sects, 2);
int part;
seq_buf_puts(&state->pp_buf, " <");
@@ -124,7 +125,7 @@ static int linux_partition(struct parsed_partitions *state,
{
Sector sect;
struct linux_part *linuxp;
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+ unsigned long size = min(nr_sects, 2);
seq_buf_puts(&state->pp_buf, " [Linux]");
^ permalink raw reply related
* [PATCH] mm: simplify the mempool_alloc_bulk API
From: Christoph Hellwig @ 2026-06-02 16:00 UTC (permalink / raw)
To: vbabka, harry, akpm
Cc: hao.li, cl, rientjes, roman.gushchin, linux-block, linux-mm
The mempool_alloc_bulk was modelled after the alloc_pages_bulk API,
including some misunderstanding of it.
Remove checking for NULL slots in the array, as alloc_pages_bulk and
kmem_cache_alloc_bulk always fill the array from the beginning and thus
we know the offset of the first failing allocation. This removes support
for working well with alloc_pages_bulk used to refill page arrays that
might have an entry removed from in the middle, but that is only used by
sunrpc and hopefully on it's way out.
Also remove the allocated parameter as it is redundant because the caller
can simply specific and offset into the entries array.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-crypto-fallback.c | 9 +++++----
include/linux/mempool.h | 2 +-
mm/mempool.c | 27 ++++++++++-----------------
3 files changed, 16 insertions(+), 22 deletions(-)
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 61f595410832..ab6924fba280 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -199,8 +199,8 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
pages += nr_segs * (PAGE_PTRS_PER_BVEC - 1);
/*
- * Try a bulk allocation first. This could leave random pages in the
- * array unallocated, but we'll fix that up later in mempool_alloc_bulk.
+ * Try a bulk allocation first. This might not fill all allocated
+ * pages, but we'll fix that up later in mempool_alloc_bulk.
*
* Note: alloc_pages_bulk needs the array to be zeroed, as it assumes
* any non-zero slot already contains a valid allocation.
@@ -208,8 +208,9 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
memset(pages, 0, sizeof(struct page *) * nr_segs);
nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_segs, pages);
if (nr_allocated < nr_segs)
- mempool_alloc_bulk(blk_crypto_bounce_page_pool, (void **)pages,
- nr_segs, nr_allocated);
+ mempool_alloc_bulk(blk_crypto_bounce_page_pool,
+ (void **)pages + nr_allocated,
+ nr_segs - nr_allocated);
memalloc_noio_restore(memflags);
*pages_ret = pages;
return bio;
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index e8e440e04a06..a0fa6d43e0dc 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -66,7 +66,7 @@ void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc;
#define mempool_alloc(...) \
alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
- unsigned int count, unsigned int allocated);
+ unsigned int count);
#define mempool_alloc_bulk(...) \
alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__))
diff --git a/mm/mempool.c b/mm/mempool.c
index db23e0eef652..473a029fa31f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -419,12 +419,8 @@ static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems,
spin_lock_irqsave(&pool->lock, flags);
if (unlikely(pool->curr_nr < count - allocated))
goto fail;
- for (i = 0; i < count; i++) {
- if (!elems[i]) {
- elems[i] = remove_element(pool);
- allocated++;
- }
- }
+ while (allocated < count)
+ elems[allocated++] = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* Paired with rmb in mempool_free(), read comment there. */
@@ -479,22 +475,21 @@ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask)
* @pool: pointer to the memory pool
* @elems: partially or fully populated elements array
* @count: number of entries in @elem that need to be allocated
- * @allocated: number of entries in @elem already allocated
*
- * Allocate elements for each slot in @elem that is non-%NULL. This is done by
- * first calling into the alloc_fn supplied at pool initialization time, and
- * dipping into the reserved pool when alloc_fn fails to allocate an element.
+ * Allocate @count elements into @elems. This is done by first calling into the
+ * alloc_fn supplied at pool initialization time, and dipping into the reserved
+ * pool when alloc_fn fails to allocate an element.
*
* On return all @count elements in @elems will be populated.
*
* Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void.
*/
int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
- unsigned int count, unsigned int allocated)
+ unsigned int count)
{
gfp_t gfp_mask = GFP_KERNEL;
gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
- unsigned int i = 0;
+ unsigned int allocated = 0;
VM_WARN_ON_ONCE(count > pool->min_nr);
might_alloc(gfp_mask);
@@ -514,11 +509,9 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
* Try to allocate the elements using the allocation callback first as
* that might succeed even when the caller's bulk allocation did not.
*/
- for (i = 0; i < count; i++) {
- if (elems[i])
- continue;
- elems[i] = pool->alloc(gfp_temp, pool->pool_data);
- if (unlikely(!elems[i]))
+ while (allocated < count) {
+ elems[allocated] = pool->alloc(gfp_temp, pool->pool_data);
+ if (unlikely(!elems[allocated]))
goto use_pool;
allocated++;
}
--
2.53.0
^ permalink raw reply related
* Re: [PATCH] blk-iocost: use irq-safe locking in cgroup handlers
From: Yu Kuai @ 2026-06-02 15:56 UTC (permalink / raw)
To: Jens Axboe, Bart Van Assche, tj, josef; +Cc: linux-block, linux-kernel, yukuai
In-Reply-To: <ac87c25f-6708-4307-ad03-645f984dfb1c@kernel.dk>
Hi,
在 2026/6/2 21:25, Jens Axboe 写道:
> On 6/1/26 3:50 PM, Bart Van Assche wrote:
>> On 5/31/26 11:13 PM, Yu Kuai wrote:
>>> @@ -3378,14 +3378,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
>>> if (!dname)
>>> return 0;
>>> - spin_lock(&ioc->lock);
>>> + spin_lock_irq(&ioc->lock);
>>> seq_printf(sf, "%s ctrl=%s model=linear "
>>> "rbps=%llu rseqiops=%llu rrandiops=%llu "
>>> "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
>>> dname, ioc->user_cost_model ? "user" : "auto",
>>> u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
>>> u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
>>> - spin_unlock(&ioc->lock);
>>> + spin_unlock_irq(&ioc->lock);
>>> return 0;
>>> }
>> This change is wrong. ioc_cost_model_prfill() only has one caller,
>> namely blkcg_print_blkgs(). blkcg_print_blkgs() calls the above function
>> with interrupts disabled. The spin_unlock_irq(&ioc->lock) at the end of
>> the above function enables interrupts while q->queue_lock is held. If an
>> interrupt happens on the same CPU core before q->queue_lock is unlocked,
>> and that interrupt tries to lock q->queue_lock, a deadlock will occur.
> Agree, it's broken. Which makes me suspect of the traces shown. Yu,
> can you please shed some light on this?
Looks like my reply is in your spam again :(
The trace is from ioc_weight_write(), which do have the problem. And
while reviewing related code, I'm wrong to think ioc_cost_model_prfill()
have the same problem and changed it as well.
>
> I've dropped it, thanks Bart.
I'll send a v2, and only fix ioc_weight_write().
>
--
Thansk,
Kuai
^ permalink raw reply
* Re: configurable block error injection
From: Christoph Hellwig @ 2026-06-02 15:05 UTC (permalink / raw)
To: Daniel Gomez
Cc: Christoph Hellwig, Jens Axboe, Jonathan Corbet, linux-block,
linux-doc, bpf, linux-kselftest, Luis Chamberlain,
Masami Hiramatsu, Brendan Gregg, GOST
In-Reply-To: <e4c653ec-dfb4-4dce-a565-2a43520fc44f@kernel.org>
On Tue, Jun 02, 2026 at 11:58:25AM +0200, Daniel Gomez wrote:
> I wonder if the block layer would be interested in moving block error
> injection off the should_fail() fault injection framework and extending
> the ALLOW_ERROR_INJECTION annotation instead and offloading all the
> debugfs configuration logic (block/error-injection.c) into eBPF?
I've looked into plain ALLOW_ERROR_INJECTION-based injection and it
is not very useful. I didn't even now eBPF could use it, but I
looked into other eBPF injections and at least for my uses cases
it was a bit of a mess. I'd have to allow access to certain bio
fields and would have create a stable UAPI for commands and status
using the fake BTF struct access which really would not be a good
idea here as we need to be able to change internals. Additionally
having fully BTF-enabled toolchains in test VMs is not great either.
I've also not actually found any good map type for range lookups,
which is kinda essential here.
> I talked about moderr [1] at LPC 2025. It's a simple error injection
> tool in eBPF for the module subsystem. The suggested direction there was
> to generalize the tool to ideally to no tool at all, and leverage
> bpftrace to describe the error injection conditions a given
> subsystem needs to be tested under. That would let blktests, for
> example, absorb that and simplify the configuration logic this series
> adds in the kernel for debugfs.
I don't think pulling in ebpftrace for simple error injection is a
winning proposition..
>
> A previous attempt to add inline error injection [2] was rejected as too
> intrusive / source-polluting;
I'm not sure a single hand waivy comment counts as rejection, although
I'm not a huge fan of setup_fault_attr - it makes a mess of debugfs and
creates a lot of boilerplate for a single not very much configurable call
site. That might be ok for something like the make_request case
(although I think we can do better as shown in this patch), but for
making random functions fail it is a lot of overhead. These injections
points also are not anywhere near stable enough to be exposed.
^ permalink raw reply
* Re: [PATCH] make new mount API honour SB_NOUSER (was Re: [PATCH] block: Avoid mounting the bdev pseudo-filesystem in userspace)
From: Christian Brauner @ 2026-06-02 14:55 UTC (permalink / raw)
To: Linus Torvalds, Al Viro
Cc: Christian Brauner, Jan Kara, linux-fsdevel, Jens Axboe,
linux-block, linux-kernel, lvc-project, stable, Denis Arefev
In-Reply-To: <20260602020444.GP2636677@ZenIV>
On Tue, 02 Jun 2026 03:04:44 +0100, Al Viro wrote:
> one should *not* be allowed to mount one of those, new API or not.
Applied to the vfs-7.2.misc branch of the vfs/vfs.git tree.
Patches in the vfs-7.2.misc branch should appear in linux-next soon.
Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.
It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.
Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.
tree: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.2.misc
[1/1] mount: honour SB_NOUSER in the new mount API
https://git.kernel.org/vfs/vfs/c/67d8c452fae1
^ permalink raw reply
* Re: [PATCH] make new mount API honour SB_NOUSER (was Re: [PATCH] block: Avoid mounting the bdev pseudo-filesystem in userspace)
From: Al Viro @ 2026-06-02 14:54 UTC (permalink / raw)
To: Arefev
Cc: Jan Kara, Linus Torvalds, Christian Brauner, linux-fsdevel,
Jens Axboe, linux-block, linux-kernel, lvc-project, stable
In-Reply-To: <b106ed26-e6ec-4254-b337-1d2e8e2adc58@swemel.ru>
On Tue, Jun 02, 2026 at 04:23:21PM +0300, Arefev wrote:
> The sequence of system calls before the crash could be as follows:
>
> fsopen("bdev", ...)
> fsconfig(fd_fs, FSCONFIG_CMD_CREATE, 0,0,0)
> fsmount(fd_fs, 0,0)
> move_mount(fd_mnt, "", AT_FDCWD, "./file1", 0x46ul)
Huh? "file1" being a regular file or was it actually
a directory? AFAICS, the d_is_dir() mismatch would be rejected
by do_move_mount()...
> The system call executed at the time of the cras:
>
> open("/dev/media0", ...);
>
> Simplified stacktrace:
>
> path_openat
> |-> link_path_walk
> |-> walk_component
> |-> __lookup_slow
> |-> ld = inode->i_op->lookup(inode, dentry, flags); <- Oops
How the hell does that thing bound on top of "./file1" lead to
resolution of "/dev/media0" walking anywhere near it? Something's
missing here.
> Checking the fc->sb_flags flag before calling vfs_create_mount() is a great
> idea,
> if it helps prevent crashes in two more file systems, 'sockfs' and 'pipefs'.
Calling vfs_create_mount() is not a problem; refusing to attach
the result if SB_NOUSER has ended up in ->s_flags is the right
thing to do, but I still would like to understand how did this call
of walk_component() manage to evade
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return -ECHILD;
}
return -ENOTDIR;
}
on the previous iteration through link_path_walk() or, if it had been
the first one, the corresponding checks at chroot()/chdir()/fchdir() time.
Note that there are very legitimate objects with NULL ->lookup() - every
regular file is like that, obviously, but there also exist ones that look
like directories in mode bits, but still have NULL ->lookup(). See
d_flags_for_inode() and look for DCACHE_AUTODIR_TYPE there.
So whatever scenario has played out, you've got a call of walk_component()
with nd->path.dentry that should have failed d_can_lookup(). That ought
to have been prevented and this prevention would better be much closer
than anything fsmount(2) does.
Don't get me wrong - userland mounting of bdev and friends should not be
allowed, but that's not the only thing that went wrong in the reproducer.
BTW, how easy to trigger it is? Is that "you need to run for a few months
on a bunch of boxen" or "run this sequence and it'll crash that way"?
^ permalink raw reply
* Re: [PATCH 8/9] block: add configurable error injection
From: Christoph Hellwig @ 2026-06-02 14:46 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Jens Axboe, Jonathan Corbet, linux-block,
linux-doc, bpf, linux-kselftest
In-Reply-To: <ah6li1JOGrpXor9W@kbusch-mbp>
On Tue, Jun 02, 2026 at 10:42:35AM +0100, Keith Busch wrote:
> When nr_sectors is 0, it is reset to U64_MAX so overflows if start > 1.
Yeah.
> I think you want to remove overriding nr_sectors to U64_MAX and do:
>
> if (!nr_sectors)
> inj->end = U64_MAX;
> else if (U64_MAX - nr_sectors < start )
> return -EINVAL;
> else
> inj->end = start + nr_sectors - 1;
I ended up ordering a bit differently for better readability, but
yes.
> > + mutex_lock(&disk->error_injection_lock);
> > + if (!disk_live(disk)) {
> > + mutex_unlock(&disk->error_injection_lock);
> > + return -EINVAL;
>
> I think we've leaked 'inj' in this error case.
Yes.
>
> > + }
> > + list_add(&inj->entry, &disk->error_injection_list);
>
> The __blk_error_inject interates this list with
> "list_for_each_entry_rcu", so shouldn't this be list_add_rcu to match?
Yes.
> > +static const match_table_t opt_tokens = {
> > + { Opt_add, "add", },
> > + { Opt_removeall, "removeall", },
> > + { Opt_op, "op=%s", },
> > + { Opt_start, "start=%u" },
> > + { Opt_nr_sectors, "nr_sectors=%u" },
>
> Shouldn't start and nr_sectors use %llu?
lib/parser.c doesn't use those prefixes, it's a bit weird.
> > + if (!options)
> > + return -ENOMEM;
> > +
>
> On failure, memdup_user_nul returns an ERR_PTR rather than NULL.
>
> if (IS_ERR(options))
> return PTR_ERR(options);
Aarg, annoying. Because memdup_user does return NULL :(
>
> > + case Removeall:
> > + if (option_mask & ~Opt_removeall)
> > + return -EINVAL;
>
> Leaking "options"? Should this be:
>
> if (option_mask & ~Opt_removeall) {
> ret = -EINVAL;
> goto out_free_options;
> }
>
> ?
Yes.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox