* [PATCH v6 12/14] block/Kyber: Make the lock context annotations compatible with Clang
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
While sparse ignores the __acquires() and __releases() arguments, Clang
verifies these. Make the arguments of __acquires() and __releases()
acceptable for Clang.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/kyber-iosched.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b84163d1f851..971818bcdc9d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -882,6 +882,9 @@ static const struct elv_fs_entry kyber_sched_attrs[] = {
};
#undef KYBER_LAT_ATTR
+#define HCTX_FROM_SEQ_FILE(m) ((struct blk_mq_hw_ctx *)(m)->private)
+#define KYBER_HCTX_DATA(hctx) ((struct kyber_hctx_data *)(hctx)->sched_data)
+
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
@@ -894,7 +897,7 @@ static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
} \
\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
- __acquires(&khd->lock) \
+ __acquires(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
@@ -913,7 +916,7 @@ static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
} \
\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
- __releases(&khd->lock) \
+ __releases(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
^ permalink raw reply related
* [PATCH v6 13/14] block/mq-deadline: Make the lock context annotations compatible with Clang
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche, Nathan Chancellor
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
While sparse ignores the __acquires() and __releases() arguments, Clang
verifies these. Make the arguments of __acquires() and __releases()
acceptable for Clang.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/mq-deadline.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 95917a88976f..824bfc17b2c6 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -794,11 +794,15 @@ static const struct elv_fs_entry deadline_attrs[] = {
__ATTR_NULL
};
+#define RQ_FROM_SEQ_FILE(m) ((struct request_queue *)(m)->private)
+#define DD_DATA_FROM_RQ(rq) \
+ ((struct deadline_data *)(rq)->elevator->elevator_data)
+
#ifdef CONFIG_BLK_DEBUG_FS
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
static void *deadline_##name##_fifo_start(struct seq_file *m, \
loff_t *pos) \
- __acquires(&dd->lock) \
+ __acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock) \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
@@ -819,7 +823,7 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
} \
\
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
- __releases(&dd->lock) \
+ __releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock) \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
@@ -921,7 +925,7 @@ static int dd_owned_by_driver_show(void *data, struct seq_file *m)
}
static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&dd->lock)
+ __acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
{
struct request_queue *q = m->private;
struct deadline_data *dd = q->elevator->elevator_data;
@@ -939,7 +943,7 @@ static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
}
static void deadline_dispatch_stop(struct seq_file *m, void *v)
- __releases(&dd->lock)
+ __releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
{
struct request_queue *q = m->private;
struct deadline_data *dd = q->elevator->elevator_data;
^ permalink raw reply related
* [PATCH v6 14/14] block: Enable lock context analysis
From: Bart Van Assche @ 2026-06-02 17:07 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Bart Van Assche
In-Reply-To: <cover.1780419600.git.bvanassche@acm.org>
Now that all block/*.c files have been annotated, enable lock context
analysis for all these source files.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
block/Makefile | 2 ++
1 file changed, 2 insertions(+)
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..54130faacc21 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,6 +3,8 @@
# Makefile for the kernel block layer
#
+CONTEXT_ANALYSIS := y
+
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
^ permalink raw reply related
* Re: [PATCH] block/partitions/acorn: use min in {riscix,linux}_partition
From: Jens Axboe @ 2026-06-02 17:19 UTC (permalink / raw)
To: Kees Cook, Josh Law, Thorsten Blum; +Cc: linux-block, linux-kernel
In-Reply-To: <20260602160757.973736-3-thorsten.blum@linux.dev>
On Tue, 02 Jun 2026 18:07:57 +0200, Thorsten Blum wrote:
> Use min() to replace the open-coded implementations and to simplify
> riscix_partition() and linux_partition().
Applied, thanks!
[1/1] block/partitions/acorn: use min in {riscix,linux}_partition
commit: aa528cd12ca6e7fda15f855b6d2095fd34d167e0
Best regards,
--
Jens Axboe
^ permalink raw reply
* Re: [PATCH 8/9] block: add configurable error injection
From: Randy Dunlap @ 2026-06-02 17:56 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe
Cc: Jonathan Corbet, linux-block, linux-doc, bpf, linux-kselftest
In-Reply-To: <20260602054615.3788425-9-hch@lst.de>
On 6/1/26 10:45 PM, Christoph Hellwig wrote:
> diff --git a/Documentation/block/error-injection.rst b/Documentation/block/error-injection.rst
> new file mode 100644
> index 000000000000..be87091b5330
> --- /dev/null
> +++ b/Documentation/block/error-injection.rst
> @@ -0,0 +1,59 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +============================
> +Configurable Error Injection
> +============================
> +
> +Overview
> +--------
> +
> +Configurable error injection allows injecting specific block layer status codes
> +for ranges of a block device. Error can be injected unconditional, or with a
Errors can be injected unconditionally or with a
> +given probability.
> +
> +To use configurable error injection, CONFIG_FAIL_MAKE_REQUEST must be enabled.
> +
> +The only interface is the error_injection debugfs file, which is created for
> +each registered gendisk. Writes to this file are used to create or delete rules
> +and reads return a list of the current error injection sites.
[snip]
--
~Randy
^ permalink raw reply
* Re: [PATCH RESEND] n64cart: use strscpy in n64cart_probe
From: Thorsten Blum @ 2026-06-02 22:42 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, linux-kernel
In-Reply-To: <20260517172617.3954-2-thorsten.blum@linux.dev>
Gentle ping?
On Sun, May 17, 2026 at 07:26:17PM +0200, Thorsten Blum wrote:
> strcpy() has been deprecated [1] because it performs no bounds checking
> on the destination buffer, which can lead to buffer overflows. While the
> current code works correctly, replace strcpy() with the safer strscpy()
> to follow secure coding best practices.
>
> [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy
>
> Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
> ---
> drivers/block/n64cart.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c
> index b9fdeff31caf..328da73b6f2c 100644
> --- a/drivers/block/n64cart.c
> +++ b/drivers/block/n64cart.c
> @@ -12,6 +12,7 @@
> #include <linux/init.h>
> #include <linux/module.h>
> #include <linux/platform_device.h>
> +#include <linux/string.h>
>
> enum {
> PI_DRAM_REG = 0,
> @@ -145,7 +146,7 @@ static int __init n64cart_probe(struct platform_device *pdev)
> disk->flags = GENHD_FL_NO_PART;
> disk->fops = &n64cart_fops;
> disk->private_data = &pdev->dev;
> - strcpy(disk->disk_name, "n64cart");
> + strscpy(disk->disk_name, "n64cart");
>
> set_capacity(disk, size >> SECTOR_SHIFT);
> set_disk_ro(disk, 1);
^ permalink raw reply
* Re: [PATCH RESEND] n64cart: use strscpy in n64cart_probe
From: Jens Axboe @ 2026-06-02 23:44 UTC (permalink / raw)
To: Thorsten Blum; +Cc: linux-block, linux-kernel
In-Reply-To: <20260517172617.3954-2-thorsten.blum@linux.dev>
On Sun, 17 May 2026 19:26:17 +0200, Thorsten Blum wrote:
> strcpy() has been deprecated [1] because it performs no bounds checking
> on the destination buffer, which can lead to buffer overflows. While the
> current code works correctly, replace strcpy() with the safer strscpy()
> to follow secure coding best practices.
>
> [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy
>
> [...]
Applied, thanks!
[1/1] n64cart: use strscpy in n64cart_probe
commit: 3f1eccd37282de91efd0575ee8e212af4bde39b1
Best regards,
--
Jens Axboe
^ permalink raw reply
* Re: [LSF/MM/BPF TOPIC] A block level, active-active replication solution
From: Hannes Reinecke @ 2026-06-03 1:07 UTC (permalink / raw)
To: Philipp Reisner, Haris Iqbal; +Cc: lsf-pc, linux-block, Jia Li
In-Reply-To: <CADGDV=USt71WBRAXXmwEERqQp0fpk7QPaOyrZjkGXpnY_yYSCQ@mail.gmail.com>
On 6/1/26 14:26, Philipp Reisner wrote:
> On Wed, May 27, 2026 at 2:16 PM Haris Iqbal <haris.iqbal@ionos.com> wrote:
>>
>> On Tue, May 5, 2026 at 11:20 AM Philipp Reisner
>> <philipp.reisner@linbit.com> wrote:
>>>
>>> Am Tue, Feb 03, 2026 at 04:09:59PM +0100 schrieb Haris Iqbal:
>>>> Hi Haris,
>>>>
>>>> We are working on a pair of kernel modules which would offer a new
>>>> replication solution in the Linux kernel. It would be a block level,
>>>> active-active replication solution for RDMA transport.
>>>>
>>>> The existing block level replication solution in the Linux kernel is
>>>> DRBD, which is an active-passive solution. The data replication in
>>>> DRBD happens through 2 network hops.
>>>>
>>>>
>>>> An active-active solution which one can build is by exporting block
>>>> devices, either through NVMeOF or RNBD/RTRS, over the network, and
>>>> then creating a raid1 device over it. It would provide a single hop
>>>> replication solution, but the synchronization during a degraded state
>>>> goes through 2 hops.
>>>>
>>>> The proposed solution would provide an active-active single hop
>>>> replication, and a single hop synchronization (directly between
>>>> storage nodes) in case of a degraded state.
>>> [...]
>>>
>>>
>>> I stumbled across this post because of the newer replies.
>>>
>>> I want to point out that we have significantly developed DRBD over the
>>> last 15 Years as an out-of-tree module. In the past months, we began
>>> the process of getting all those improvements back into Linux
>>> upstream.
>>>
>>> With that, DRBD9 became multi-node. It does the “active-active single
>>> hop replication” as it is. The networking part is now abstracted into
>>> transport modules. We have one for TCP, one for load balancing across
>>> multiple TCP connections, and one for RDMA.
>>>
>>> What you are doing here, in DRBD lingo, is a diskless primary
>>> connected to multiple storage nodes.
>>>
>>> Find everything here https://github.com/LINBIT.
>>> The latest edition of what we bring to the upstreaming discussion:
>>> https://github.com/LINBIT/linux-drbd/tree/drbd-next
>>
>> Hi Philipp,
>>
>> Interesting.
>> I looked into the diskless primary mode configuration for DRBD, and it
>> does look similar to what RMR/BRMR offers.
>> We plan to do comparison runs of DRBD diskless primary mode, and RMR/BRMR.
>>
>> I see the DRBD version in the current kernel is still 8.x.x.
>> Do you have an ETA by when can we have version 9 in the kernel?
>>
>
> Hi Haris,
>
> We are working on it. We are currently aligning the way we do things
> with Generic Netlink. We are aligning our code to follow the upstream
> conventions,
> and the .YML and code generator used nowadays in this area.
>
> If we are not discovering another area where similar cleanups are necessary,
> I expect that we will submit it in August.
>
A round of applause for that effort. We have been waiting for that for a
long time.
Thanks Philipp!
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* Re: [PATCH v6 06/14] block/blk-iocost: Combine two error paths in ioc_qos_write()
From: Hannes Reinecke @ 2026-06-03 1:46 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Tejun Heo, Josef Bacik
In-Reply-To: <fe85b8a3117486e3d1fe40b16d7c9621a070499a.1780419600.git.bvanassche@acm.org>
On 6/2/26 19:07, Bart Van Assche wrote:
> Reduce code duplication by combining two error paths. No functionality
> has been changed.
>
> Suggested-by: Christoph Hellwig <hch@lst.de>
> Cc: Tejun Heo <tj@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-iocost.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* Re: [PATCH v6 07/14] block/cgroup: Inline blkg_conf_{open,close}_bdev_frozen()
From: Hannes Reinecke @ 2026-06-03 1:47 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Tejun Heo, Josef Bacik
In-Reply-To: <08f0e3d5f8c01523a66cf55303acfa3c4ff47725.1780419600.git.bvanassche@acm.org>
On 6/2/26 19:07, Bart Van Assche wrote:
> The blkg_conf_open_bdev_frozen() calling convention is not compatible
> with lock context annotations. Fold both blkg_conf_open_bdev_frozen()
> and blkg_conf_close_bdev_frozen() into their only caller. This patch
> prepares for enabling lock context analysis.
>
> The type of 'memflags' has been changed from unsigned long into unsigned
> int to match the type of current->flags. See also <linux/sched.h>.
>
> Cc: Tejun Heo <tj@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-cgroup.c | 46 ----------------------------------------------
> block/blk-cgroup.h | 4 ----
> block/blk-iocost.c | 25 +++++++++++++++++++------
> 3 files changed, 19 insertions(+), 56 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* Re: [PATCH v6 09/14] block/blk-iocost: Split ioc_rqos_throttle()
From: Hannes Reinecke @ 2026-06-03 1:53 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Tejun Heo, Josef Bacik
In-Reply-To: <8037f3bb5f5ecbbd635ef48804a1a9ad2d2be2e0.1780419600.git.bvanassche@acm.org>
On 6/2/26 19:07, Bart Van Assche wrote:
> Prepare for inlining iocg_lock() and iocg_unlock() by moving the code
> between these two calls into a new function. No functionality has been
> changed.
>
> Cc: Tejun Heo <tj@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-iocost.c | 163 ++++++++++++++++++++++++++-------------------
> 1 file changed, 94 insertions(+), 69 deletions(-)
>
Not a big fan of the 'action' enum, but I don't see a better way here.
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* Re: [PATCH v6 11/14] block/blk-mq-debugfs: Improve lock context annotations
From: Hannes Reinecke @ 2026-06-03 1:58 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Nathan Chancellor
In-Reply-To: <66eff9a5e488fbe0b79db3a54518fe5d181a31ec.1780419600.git.bvanassche@acm.org>
On 6/2/26 19:07, Bart Van Assche wrote:
> Make the existing lock context annotations compatible with Clang. Add
> the lock context annotations that are missing.
>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-mq-debugfs.c | 24 ++++++++++++++++++------
> block/blk.h | 4 ++++
> 2 files changed, 22 insertions(+), 6 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* Re: [PATCH v6 10/14] block/blk-iocost: Inline iocg_lock() and iocg_unlock()
From: Hannes Reinecke @ 2026-06-03 1:59 UTC (permalink / raw)
To: Bart Van Assche, Jens Axboe
Cc: linux-block, Christoph Hellwig, Hannes Reinecke, Damien Le Moal,
Tejun Heo, Josef Bacik
In-Reply-To: <e0e81fbd746bfb54c31cb84f8671a0c23dcdedb2.1780419600.git.bvanassche@acm.org>
On 6/2/26 19:07, Bart Van Assche wrote:
> Both iocg_lock() and iocg_unlock() use conditional locking. Fold these
> functions into their callers such that unlocking becomes unconditional.
>
> Cc: Tejun Heo <tj@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
> block/blk-iocost.c | 49 +++++++++++++++++++---------------------------
> 1 file changed, 20 insertions(+), 29 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply
* [PATCH] block/fops: fix refcount underflow in __blkdev_direct_IO()
From: Wentao Liang @ 2026-06-03 2:10 UTC (permalink / raw)
To: axboe; +Cc: linux-block, linux-kernel, Wentao Liang, stable
__blkdev_direct_IO() calls bio_get() and bio_put() around
I/O operations, but if the I/O fails, the error path may call
bio_put() twice, causing a refcount underflow.
Fix this by moving the bio_put() call from the error path into
the cleanup section that is only executed once, regardless of
whether the I/O succeeded or failed.
Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer")
Cc: stable@vger.kernel.org
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
block/fops.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..4dd1e40c7d4e 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -286,6 +286,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_release_pages(bio, false);
bio_clear_flag(bio, BIO_REFFED);
bio_put(bio);
+ if (bio == &dio->bio)
+ bio_put(bio);
blk_finish_plug(&plug);
return ret;
}
--
2.34.1
^ permalink raw reply related
* Re: bio_copy_from_iter
From: Matthew Wilcox @ 2026-06-03 4:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Jens Axboe, linux-block
In-Reply-To: <20260602060319.GA325@lst.de>
On Tue, Jun 02, 2026 at 08:03:19AM +0200, Christoph Hellwig wrote:
> On Mon, Jun 01, 2026 at 06:10:18PM +0100, Matthew Wilcox wrote:
> > I'd like to remove copy_page_to_iter() (and only have
> > copy_folio_to_iter()). That led me to looking at bio_copy_to_iter().
> > At first glance, switching it to bio_for_each_folio_all() makes a lot of
> > sense -- if there are large folios involved, then we can copy an entire
> > folio at a time instead of a page.
> >
> > But what I can't prove to my satisfaction is that every bio passed to
> > bio_copy_to_iter() necessarily contains folios. That's not currently
> > necessary, but will become necessary in the future. [1].
>
> bio_copy_to_iter is only called from blk_rq_unmap_user, which is only
> used for bios created blk_rq_map_user_iov and only used when they had to
> be bounce buffer. There's two sources of pages for the bounce buffering:
> The allocation in bio_copy_user_iov using alloca_page, and whatever sg
> and st pass in through struct rq_map_data.
I must have misread the code. I thought we could get to bio_map_kern()
from blk_rq_map_user_iov(), but on reviewing it now, I don't see a way.
> A good step to be able
> to validate this would be to kill the mess around struct rq_map_data,
> as in removing that structure. There's no good reason why these
> drivers should do their own allocations, this has mostly been
> grandfathered in.
Yes, that doesn't look too bad. Just sg and st using it.
> > [1] I believe all these bvecs are constructed using
> > blk_rq_map_user_iov() which can end up calling bio_add_vmalloc(),
> > and vmalloc pages will not be folios.
>
> blk_rq_map_user_iov can't call bio_add_vmalloc. And if you want to make
> the vmalloc backing not folios you will be in a huge world of pain
> anyway, as we expect to back folios using vmap/vm_map_ram and treating
> vmalloc different from this will be extremely messy and invasive.
Assuming you inadvertently transposed two words and meant to write "we
expect to back vmap/vm_map_ram using folios", I do intend to keep that
working, but I also do not intend to allocate folios in order to back
vmalloc areas. I don't think it'll be all that messy, but I haven't
worked through the details of it yet.
^ permalink raw reply
* [PATCH v2] block: assign caller-specific lockdep class to disk->open_mutex
From: Tetsuo Handa @ 2026-06-03 6:25 UTC (permalink / raw)
To: Christoph Hellwig, Bart Van Assche, Jens Axboe
Cc: linux-block, LKML, Andrew Morton, Ming Lei, Damien Le Moal,
Qu Wenruo, Hillf Danton, Miguel Ojeda
In-Reply-To: <20260601071113.GB7468@lst.de>
The block core currently allocates a single monolithic lockdep key for
disk->open_mutex across all callers. This single key conflates locking
hierarchies between independent block streams. For example, if a stacked
driver like loop flushes its internal workqueues inside lo_release() while
holding its own open_mutex, lockdep views this as a potential ABBA deadlock
against the underlying storage stack, leading to numerous circular
dependency splats.
To structurally reduce false positives, this patch splits the global
monolithic lock class into distinct, per-caller instances during disk
allocation. This is done by replacing "struct lock_class_key" with
"struct gendisk_lkclass", which contains two instances of
"struct lock_class_key" for the legacy "(bio completion)" map and
disk->open_mutex respectively.
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
Changes in v2:
- Replaced a two-element array with a struct with two named members, suggested by Bart Van Assche
( https://lore.kernel.org/all/4cf7ecc7-932c-4589-9d0f-3e025e83e27c@acm.org/ ).
- Added changes needed by Rust block layer bindings and rnull module, pointed out by sashiko
( https://sashiko.dev/#/patchset/147ed056-03d9-4214-b925-0f10fc00cf27%40I-love.SAKURA.ne.jp ).
Testing result of v1:
- I kept v1 patch in linux-next for several more days, but result was that
some of circular dependency splats which I thought this change succeeded to
eliminate are still getting reported. That is, we need to determine whether
we should make this change without example syzbot reports that demonstrates
difference. But in general, keeping locking chains simpler and shorter
should be a good change.
Acknowledgment:
Since I have no experience with Rust, changes needed by Rust block layer
bindings and rnull module are made based on conversation with the Gemini
AI collaborator.
block/blk-mq.c | 4 ++--
block/blk.h | 2 +-
block/genhd.c | 8 ++++----
drivers/block/rnull/rnull.rs | 2 +-
drivers/scsi/sd.c | 2 +-
drivers/scsi/sr.c | 2 +-
include/linux/blk-mq.h | 6 +++---
include/linux/blkdev.h | 9 +++++++--
rust/kernel/block/mq/gen_disk.rs | 17 +++++++++++++++--
9 files changed, 35 insertions(+), 17 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a24175441380..5203e8cc6a28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4492,7 +4492,7 @@ EXPORT_SYMBOL(blk_mq_destroy_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
@@ -4513,7 +4513,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
EXPORT_SYMBOL(__blk_mq_alloc_disk);
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct gendisk *disk;
diff --git a/block/blk.h b/block/blk.h
index b998a7761faf..611bcd655357 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -614,7 +614,7 @@ void drop_partition(struct block_device *part);
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e9..8f4a3d8ca15e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1444,7 +1444,7 @@ dev_t part_devt(struct gendisk *disk, u8 partno)
}
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct gendisk *disk;
@@ -1467,7 +1467,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
goto out_free_bdi;
disk->node_id = node_id;
- mutex_init(&disk->open_mutex);
+ mutex_init_with_key(&disk->open_mutex, &lkclass->open_mutex_lkclass);
xa_init(&disk->part_tbl);
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
@@ -1482,7 +1482,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
device_initialize(disk_to_dev(disk));
inc_diskseq(disk);
q->disk = disk;
- lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
+ lockdep_init_map(&disk->lockdep_map, "(bio completion)", &lkclass->bio_lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
@@ -1506,7 +1506,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
}
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct queue_limits default_lim = { };
struct request_queue *q;
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 0ca8715febe8..476a8910c432 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -61,7 +61,7 @@ fn new(
.logical_block_size(block_size)?
.physical_block_size(block_size)?
.rotational(rotational)
- .build(fmt!("{}", name.to_str()?), tagset, queue_data)
+ .build(fmt!("{}", name.to_str()?), tagset, queue_data, kernel::my_gendisk_lkclass!())
}
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 599e75f33334..63fe8c86606a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -112,7 +112,7 @@ static DEFINE_MUTEX(sd_mutex_lock);
static mempool_t *sd_page_pool;
static mempool_t *sd_large_page_pool;
static atomic_t sd_large_page_pool_users = ATOMIC_INIT(0);
-static struct lock_class_key sd_bio_compl_lkclass;
+static struct gendisk_lkclass sd_bio_compl_lkclass;
static const char *sd_cache_types[] = {
"write through", "none", "write back",
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index c36c54ecd354..734567ae0e43 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -106,7 +106,7 @@ static struct scsi_driver sr_template = {
static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG];
static DEFINE_SPINLOCK(sr_index_lock);
-static struct lock_class_key sr_bio_compl_lkclass;
+static struct gendisk_lkclass sr_bio_compl_lkclass;
static int sr_open(struct cdrom_device_info *, int);
static void sr_release(struct cdrom_device_info *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..5aa17e82c3ba 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -726,15 +726,15 @@ enum {
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata) \
({ \
- static struct lock_class_key __key; \
+ static struct gendisk_lkclass __key; \
\
__blk_mq_alloc_disk(set, lim, queuedata, &__key); \
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..28b0aee6b3ba 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -49,6 +49,11 @@ extern const struct device_type disk_type;
extern const struct device_type part_type;
extern const struct class block_class;
+struct gendisk_lkclass {
+ struct lock_class_key bio_lkclass;
+ struct lock_class_key open_mutex_lkclass;
+};
+
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
@@ -974,7 +979,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate);
void put_disk(struct gendisk *disk);
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
/**
* blk_alloc_disk - allocate a gendisk structure
@@ -990,7 +995,7 @@ struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
*/
#define blk_alloc_disk(lim, node_id) \
({ \
- static struct lock_class_key __key; \
+ static struct gendisk_lkclass __key; \
\
__blk_alloc_disk(lim, node_id, &__key); \
})
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
index 912cb805caf5..b4efe4b1b5f6 100644
--- a/rust/kernel/block/mq/gen_disk.rs
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -11,7 +11,6 @@
error::{self, from_err_ptr, Result},
fmt::{self, Write},
prelude::*,
- static_lock_class,
str::NullTerminatedFormatter,
sync::Arc,
types::{ForeignOwnable, ScopeGuard},
@@ -38,6 +37,19 @@ fn default() -> Self {
}
}
+/// Helper macro to generate a unique caller-local static lock class struct
+#[macro_export]
+macro_rules! my_gendisk_lkclass {
+ () => {{
+ static mut LKCLASS: $crate::bindings::gendisk_lkclass = $crate::bindings::gendisk_lkclass {
+ bio_lkclass: const { unsafe { ::core::mem::zeroed() } },
+ open_mutex_lkclass: const { unsafe { ::core::mem::zeroed() } },
+ };
+
+ unsafe { &raw mut LKCLASS }
+ }};
+}
+
impl GenDiskBuilder {
/// Create a new instance.
pub fn new() -> Self {
@@ -100,6 +112,7 @@ pub fn build<T: Operations>(
name: fmt::Arguments<'_>,
tagset: Arc<TagSet<T>>,
queue_data: T::QueueData,
+ lkclass: *mut bindings::gendisk_lkclass,
) -> Result<GenDisk<T>> {
let data = queue_data.into_foreign();
let recover_data = ScopeGuard::new(|| {
@@ -121,7 +134,7 @@ pub fn build<T: Operations>(
tagset.raw_tag_set(),
&mut lim,
data,
- static_lock_class!().as_ptr(),
+ lkclass,
)
})?;
--
2.54.0
^ permalink raw reply related
* [syzbot ci] Re: block/fops: fix refcount underflow in __blkdev_direct_IO()
From: syzbot ci @ 2026-06-03 6:41 UTC (permalink / raw)
To: axboe, linux-block, linux-kernel, stable, vulab; +Cc: syzbot, syzkaller-bugs
In-Reply-To: <20260603021035.3690601-1-vulab@iscas.ac.cn>
syzbot ci has tested the following series
[v1] block/fops: fix refcount underflow in __blkdev_direct_IO()
https://lore.kernel.org/all/20260603021035.3690601-1-vulab@iscas.ac.cn
* [PATCH] block/fops: fix refcount underflow in __blkdev_direct_IO()
and found the following issue:
KASAN: invalid-free in mempool_free
Full report is available here:
https://ci.syzbot.org/series/3498c893-003a-4780-92e5-c3090ee3fe45
***
KASAN: invalid-free in mempool_free
tree: torvalds
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base: ba3e43a9e601636f5edb54e259a74f96ca3b8fd8
arch: amd64
compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config: https://ci.syzbot.org/builds/0616259e-7b19-4982-aeee-0a50317c4cc6/config
syz repro: https://ci.syzbot.org/findings/d2d98b4c-c6a9-4ecb-b313-ebd5a6d1b0d8/syz_repro
==================================================================
BUG: KASAN: double-free in mempool_free+0xec/0x130 mm/mempool.c:711
Free of addr ffff888177994400 by task syz.0.17/5875
CPU: 1 UID: 0 PID: 5875 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
print_address_description+0x55/0x1e0 mm/kasan/report.c:378
print_report+0x58/0x70 mm/kasan/report.c:482
kasan_report_invalid_free+0xea/0x110 mm/kasan/report.c:557
check_slab_allocation mm/kasan/common.c:-1 [inline]
__kasan_slab_pre_free+0x104/0x120 mm/kasan/common.c:261
kasan_slab_pre_free include/linux/kasan.h:199 [inline]
slab_free_hook mm/slub.c:2634 [inline]
slab_free mm/slub.c:6251 [inline]
kmem_cache_free+0x130/0x650 mm/slub.c:6378
mempool_free+0xec/0x130 mm/mempool.c:711
bio_free+0x1e9/0x330 block/bio.c:205
__blkdev_direct_IO+0xd89/0xf40 block/fops.c:290
blkdev_direct_IO+0x121a/0x1790 block/fops.c:438
blkdev_read_iter+0x23d/0x440 block/fops.c:841
do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1
vfs_readv+0x288/0x840 fs/read_write.c:1020
do_preadv fs/read_write.c:1134 [inline]
__do_sys_preadv2 fs/read_write.c:1193 [inline]
__se_sys_preadv2+0x184/0x2a0 fs/read_write.c:1184
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f125c99ce59
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f125d829028 EFLAGS: 00000246 ORIG_RAX: 0000000000000147
RAX: ffffffffffffffda RBX: 00007f125cc15fa0 RCX: 00007f125c99ce59
RDX: 0000000000000005 RSI: 0000200000000080 RDI: 0000000000000003
RBP: 00007f125ca32d6f R08: 0000000000000000 R09: 000000000000001f
R10: 0000000000002000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f125cc16038 R14: 00007f125cc15fa0 R15: 00007fff02d2e3e8
</TASK>
Allocated by task 5875:
kasan_save_stack mm/kasan/common.c:57 [inline]
kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
unpoison_slab_object mm/kasan/common.c:340 [inline]
__kasan_slab_alloc+0x6c/0x80 mm/kasan/common.c:366
kasan_slab_alloc include/linux/kasan.h:253 [inline]
slab_post_alloc_hook mm/slub.c:4570 [inline]
slab_alloc_node mm/slub.c:4899 [inline]
kmem_cache_alloc_noprof+0x2bc/0x650 mm/slub.c:4906
bio_alloc_bioset+0x599/0xc90 block/bio.c:571
__blkdev_direct_IO+0x294/0xf40 block/fops.c:186
blkdev_direct_IO+0x121a/0x1790 block/fops.c:438
blkdev_read_iter+0x23d/0x440 block/fops.c:841
do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1
vfs_readv+0x288/0x840 fs/read_write.c:1020
do_preadv fs/read_write.c:1134 [inline]
__do_sys_preadv2 fs/read_write.c:1193 [inline]
__se_sys_preadv2+0x184/0x2a0 fs/read_write.c:1184
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
Freed by task 5875:
kasan_save_stack mm/kasan/common.c:57 [inline]
kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584
poison_slab_object mm/kasan/common.c:253 [inline]
__kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285
kasan_slab_free include/linux/kasan.h:235 [inline]
slab_free_hook mm/slub.c:2689 [inline]
slab_free mm/slub.c:6251 [inline]
kmem_cache_free+0x182/0x650 mm/slub.c:6378
mempool_free+0xec/0x130 mm/mempool.c:711
bio_free+0x1e9/0x330 block/bio.c:205
__blkdev_direct_IO+0xd6e/0xf40 block/fops.c:288
blkdev_direct_IO+0x121a/0x1790 block/fops.c:438
blkdev_read_iter+0x23d/0x440 block/fops.c:841
do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1
vfs_readv+0x288/0x840 fs/read_write.c:1020
do_preadv fs/read_write.c:1134 [inline]
__do_sys_preadv2 fs/read_write.c:1193 [inline]
__se_sys_preadv2+0x184/0x2a0 fs/read_write.c:1184
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
The buggy address belongs to the object at ffff888177994400
which belongs to the cache biovec-max of size 4096
The buggy address is located 0 bytes inside of
4096-byte region [ffff888177994400, ffff888177995400)
The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888177990000 pfn:0x177990
head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0x57ff00000000240(workingset|head|node=1|zone=2|lastcpupid=0x7ff)
page_type: f5(slab)
raw: 057ff00000000240 ffff8881604148c0 ffff8881622adc48 ffffea0005de5010
raw: ffff888177990000 0000000800070003 00000000f5000000 0000000000000000
head: 057ff00000000240 ffff8881604148c0 ffff8881622adc48 ffffea0005de5010
head: ffff888177990000 0000000800070003 00000000f5000000 0000000000000000
head: 057ff00000000003 fffffffffffffe01 00000000ffffffff 00000000ffffffff
head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000008
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd2800(GFP_NOWAIT|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 5065, tgid 5065 (udevd), ts 29098398773, free_ts 24434603765
set_page_owner include/linux/page_owner.h:32 [inline]
post_alloc_hook+0x22d/0x280 mm/page_alloc.c:1853
prep_new_page mm/page_alloc.c:1861 [inline]
get_page_from_freelist+0x2593/0x2610 mm/page_alloc.c:3941
__alloc_frozen_pages_noprof+0x18d/0x380 mm/page_alloc.c:5221
alloc_slab_page mm/slub.c:3278 [inline]
allocate_slab+0x77/0x660 mm/slub.c:3467
new_slab mm/slub.c:3525 [inline]
refill_objects+0x339/0x3d0 mm/slub.c:7272
refill_sheaf mm/slub.c:2816 [inline]
__pcs_replace_empty_main+0x321/0x720 mm/slub.c:4652
alloc_from_pcs mm/slub.c:4750 [inline]
slab_alloc_node mm/slub.c:4884 [inline]
kmem_cache_alloc_noprof+0x37d/0x650 mm/slub.c:4906
bio_alloc_bioset+0x599/0xc90 block/bio.c:571
bio_alloc include/linux/bio.h:367 [inline]
ext4_mpage_readpages+0x13b5/0x1f30 fs/ext4/readpage.c:355
read_pages+0x193/0x5a0 mm/readahead.c:163
page_cache_ra_unbounded+0x794/0xa10 mm/readahead.c:304
do_page_cache_ra mm/readahead.c:334 [inline]
page_cache_ra_order+0xae4/0xe80 mm/readahead.c:538
filemap_readahead mm/filemap.c:2664 [inline]
filemap_get_pages+0x897/0x1ef0 mm/filemap.c:2710
filemap_read+0x447/0x1230 mm/filemap.c:2806
__kernel_read+0x504/0x9b0 fs/read_write.c:532
integrity_kernel_read+0x89/0xd0 security/integrity/iint.c:28
page last free pid 10 tgid 10 stack trace:
reset_page_owner include/linux/page_owner.h:25 [inline]
__free_pages_prepare mm/page_alloc.c:1397 [inline]
__free_frozen_pages+0xc1c/0xd30 mm/page_alloc.c:2938
vfree+0x1d1/0x2f0 mm/vmalloc.c:3472
delayed_vfree_work+0x55/0x80 mm/vmalloc.c:3392
process_one_work kernel/workqueue.c:3314 [inline]
process_scheduled_works+0xb5d/0x1860 kernel/workqueue.c:3397
worker_thread+0xa53/0xfc0 kernel/workqueue.c:3478
kthread+0x389/0x470 kernel/kthread.c:436
ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
Memory state around the buggy address:
ffff888177994300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff888177994380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff888177994400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff888177994480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff888177994500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).
The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.
^ permalink raw reply
* [syzbot ci] Re: fs: support freeze/thaw/mark_dead/sync with shared devices
From: syzbot ci @ 2026-06-03 6:43 UTC (permalink / raw)
To: axboe, brauner, cem, clm, dsterba, hch, jack, linux-block,
linux-btrfs, linux-erofs, linux-ext4, linux-fsdevel, linux-kernel,
linux-xfs, tytso, viro, xiang
Cc: syzbot, syzkaller-bugs
In-Reply-To: <20260602-work-super-bdev_holder_global-v1-0-bb0fd82f3861@kernel.org>
syzbot ci has tested the following series
[v1] fs: support freeze/thaw/mark_dead/sync with shared devices
https://lore.kernel.org/all/20260602-work-super-bdev_holder_global-v1-0-bb0fd82f3861@kernel.org
* [PATCH RFC 1/8] fs, block: move blk_mode_t and fop_flags_t into <linux/types.h>
* [PATCH RFC 2/8] fs: add a global device to super block hash table
* [PATCH RFC 3/8] fs: refuse to claim any frozen block device
* [PATCH RFC 4/8] xfs: port to fs_bdev_file_open_by_path()
* [PATCH RFC 5/8] btrfs: open via dedicated fs bdev helpers
* [PATCH RFC 6/8] ext4: open via dedicated fs bdev helpers
* [PATCH RFC 7/8] erofs: open via dedicated fs bdev helpers
* [PATCH RFC 8/8] super: make fs_holder_ops private
and found the following issue:
general protection fault in close_fs_devices
Full report is available here:
https://ci.syzbot.org/series/9511f00a-a3c2-44ab-9a0b-2d65de5bbd49
***
general protection fault in close_fs_devices
tree: bpf-next
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git
base: 254f49634ee16a731174d2ae34bc50bd5f45e731
arch: amd64
compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config: https://ci.syzbot.org/builds/4af26755-5773-453e-807d-ee451d2fdec5/config
syz repro: https://ci.syzbot.org/findings/2d8d96f7-d133-47dc-b4ca-5c0c65e1b6c9/syz_repro
btrfs: Deprecated parameter 'usebackuproot'
BTRFS warning: 'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead
BTRFS: device fsid ed167579-eb65-4e76-9a50-61ac97e9b59d devid 1281 transid 8 /dev/loop1 (7:1) scanned by syz.1.18 (5863)
Oops: general protection fault, probably for non-canonical address 0xdffffc00000000f8: 0000 [#1] SMP KASAN PTI
KASAN: null-ptr-deref in range [0x00000000000007c0-0x00000000000007c7]
CPU: 1 UID: 0 PID: 5863 Comm: syz.1.18 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:btrfs_close_bdev fs/btrfs/volumes.c:1140 [inline]
RIP: 0010:btrfs_close_one_device fs/btrfs/volumes.c:1161 [inline]
RIP: 0010:close_fs_devices+0x47c/0x860 fs/btrfs/volumes.c:1204
Code: 3c 08 00 74 08 48 89 ef e8 b1 95 38 fe 48 8b 6d 00 b8 c0 07 00 00 48 01 c5 48 89 e8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 48 89 ef e8 86 95 38 fe 48 8b 75 00 4c 89 ff e8
RSP: 0018:ffffc90004007a48 EFLAGS: 00010202
RAX: 00000000000000f8 RBX: 1ffff110368c440b RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 00000000000007c0 R08: ffff8881b462206f R09: 1ffff110368c440d
R10: dffffc0000000000 R11: ffffed10368c440e R12: ffff8881b4622000
R13: ffff8881b4622068 R14: ffff8881b4622058 R15: ffff8881707b7a00
FS: 00007f849d6ce6c0(0000) GS:ffff8882a9292000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f849c786a00 CR3: 00000001bbbcc000 CR4: 00000000000006f0
Call Trace:
<TASK>
btrfs_close_devices+0xcd/0x570 fs/btrfs/volumes.c:1219
btrfs_free_fs_info+0x4f/0x360 fs/btrfs/disk-io.c:1205
deactivate_locked_super+0xbc/0x130 fs/super.c:477
btrfs_get_tree_super fs/btrfs/super.c:-1 [inline]
btrfs_get_tree_subvol fs/btrfs/super.c:2087 [inline]
btrfs_get_tree+0xca6/0x1910 fs/btrfs/super.c:2121
vfs_get_tree+0x92/0x2a0 fs/super.c:1928
fc_mount fs/namespace.c:1193 [inline]
do_new_mount_fc fs/namespace.c:3758 [inline]
do_new_mount+0x341/0xd30 fs/namespace.c:3834
do_mount fs/namespace.c:4167 [inline]
__do_sys_mount fs/namespace.c:4383 [inline]
__se_sys_mount+0x31d/0x420 fs/namespace.c:4360
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x15f/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f849c79e0ca
Code: 48 c7 c2 e8 ff ff ff f7 d8 64 89 02 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f849d6cde58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 00007f849d6cdee0 RCX: 00007f849c79e0ca
RDX: 00002000000055c0 RSI: 0000200000000340 RDI: 00007f849d6cdea0
RBP: 00002000000055c0 R08: 00007f849d6cdee0 R09: 0000000000000408
R10: 0000000000000408 R11: 0000000000000246 R12: 0000200000000340
R13: 00007f849d6cdea0 R14: 00000000000055f5 R15: 0000200000000380
</TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:btrfs_close_bdev fs/btrfs/volumes.c:1140 [inline]
RIP: 0010:btrfs_close_one_device fs/btrfs/volumes.c:1161 [inline]
RIP: 0010:close_fs_devices+0x47c/0x860 fs/btrfs/volumes.c:1204
Code: 3c 08 00 74 08 48 89 ef e8 b1 95 38 fe 48 8b 6d 00 b8 c0 07 00 00 48 01 c5 48 89 e8 48 c1 e8 03 48 b9 00 00 00 00 00 fc ff df <80> 3c 08 00 74 08 48 89 ef e8 86 95 38 fe 48 8b 75 00 4c 89 ff e8
RSP: 0018:ffffc90004007a48 EFLAGS: 00010202
RAX: 00000000000000f8 RBX: 1ffff110368c440b RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 00000000000007c0 R08: ffff8881b462206f R09: 1ffff110368c440d
R10: dffffc0000000000 R11: ffffed10368c440e R12: ffff8881b4622000
R13: ffff8881b4622068 R14: ffff8881b4622058 R15: ffff8881707b7a00
FS: 00007f849d6ce6c0(0000) GS:ffff8882a9292000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000557941c2b058 CR3: 00000001bbbcc000 CR4: 00000000000006f0
----------------
Code disassembly (best guess):
0: 3c 08 cmp $0x8,%al
2: 00 74 08 48 add %dh,0x48(%rax,%rcx,1)
6: 89 ef mov %ebp,%edi
8: e8 b1 95 38 fe call 0xfe3895be
d: 48 8b 6d 00 mov 0x0(%rbp),%rbp
11: b8 c0 07 00 00 mov $0x7c0,%eax
16: 48 01 c5 add %rax,%rbp
19: 48 89 e8 mov %rbp,%rax
1c: 48 c1 e8 03 shr $0x3,%rax
20: 48 b9 00 00 00 00 00 movabs $0xdffffc0000000000,%rcx
27: fc ff df
* 2a: 80 3c 08 00 cmpb $0x0,(%rax,%rcx,1) <-- trapping instruction
2e: 74 08 je 0x38
30: 48 89 ef mov %rbp,%rdi
33: e8 86 95 38 fe call 0xfe3895be
38: 48 8b 75 00 mov 0x0(%rbp),%rsi
3c: 4c 89 ff mov %r15,%rdi
3f: e8 .byte 0xe8
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).
The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.
^ permalink raw reply
* Re: [PATCH] mm: simplify the mempool_alloc_bulk API
From: Vlastimil Babka (SUSE) @ 2026-06-03 9:41 UTC (permalink / raw)
To: Christoph Hellwig, harry, akpm
Cc: hao.li, cl, rientjes, roman.gushchin, linux-block, linux-mm
In-Reply-To: <20260602160038.3976341-1-hch@lst.de>
On 6/2/26 18:00, Christoph Hellwig wrote:
> The mempool_alloc_bulk was modelled after the alloc_pages_bulk API,
> including some misunderstanding of it.
>
> Remove checking for NULL slots in the array, as alloc_pages_bulk and
> kmem_cache_alloc_bulk always fill the array from the beginning and thus
> we know the offset of the first failing allocation. This removes support
> for working well with alloc_pages_bulk used to refill page arrays that
> might have an entry removed from in the middle, but that is only used by
> sunrpc and hopefully on it's way out.
Importantly sunrpc doesn't use mempool_alloc_bulk() so we're not affecting
it here.
> Also remove the allocated parameter as it is redundant because the caller
> can simply specific and offset into the entries array.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Somewhat late, but small and affects only one caller. So applied to
slab/for-7.2/alloc_bulk on top of the kmem_cache_alloc_bulk() change.
> ---
> block/blk-crypto-fallback.c | 9 +++++----
> include/linux/mempool.h | 2 +-
> mm/mempool.c | 27 ++++++++++-----------------
> 3 files changed, 16 insertions(+), 22 deletions(-)
>
> diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
> index 61f595410832..ab6924fba280 100644
> --- a/block/blk-crypto-fallback.c
> +++ b/block/blk-crypto-fallback.c
> @@ -199,8 +199,8 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
> pages += nr_segs * (PAGE_PTRS_PER_BVEC - 1);
>
> /*
> - * Try a bulk allocation first. This could leave random pages in the
> - * array unallocated, but we'll fix that up later in mempool_alloc_bulk.
> + * Try a bulk allocation first. This might not fill all allocated
> + * pages, but we'll fix that up later in mempool_alloc_bulk.
> *
> * Note: alloc_pages_bulk needs the array to be zeroed, as it assumes
> * any non-zero slot already contains a valid allocation.
> @@ -208,8 +208,9 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
> memset(pages, 0, sizeof(struct page *) * nr_segs);
> nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_segs, pages);
> if (nr_allocated < nr_segs)
> - mempool_alloc_bulk(blk_crypto_bounce_page_pool, (void **)pages,
> - nr_segs, nr_allocated);
> + mempool_alloc_bulk(blk_crypto_bounce_page_pool,
> + (void **)pages + nr_allocated,
> + nr_segs - nr_allocated);
> memalloc_noio_restore(memflags);
> *pages_ret = pages;
> return bio;
> diff --git a/include/linux/mempool.h b/include/linux/mempool.h
> index e8e440e04a06..a0fa6d43e0dc 100644
> --- a/include/linux/mempool.h
> +++ b/include/linux/mempool.h
> @@ -66,7 +66,7 @@ void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc;
> #define mempool_alloc(...) \
> alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
> int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
> - unsigned int count, unsigned int allocated);
> + unsigned int count);
> #define mempool_alloc_bulk(...) \
> alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__))
>
> diff --git a/mm/mempool.c b/mm/mempool.c
> index db23e0eef652..473a029fa31f 100644
> --- a/mm/mempool.c
> +++ b/mm/mempool.c
> @@ -419,12 +419,8 @@ static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems,
> spin_lock_irqsave(&pool->lock, flags);
> if (unlikely(pool->curr_nr < count - allocated))
> goto fail;
> - for (i = 0; i < count; i++) {
> - if (!elems[i]) {
> - elems[i] = remove_element(pool);
> - allocated++;
> - }
> - }
> + while (allocated < count)
> + elems[allocated++] = remove_element(pool);
> spin_unlock_irqrestore(&pool->lock, flags);
>
> /* Paired with rmb in mempool_free(), read comment there. */
> @@ -479,22 +475,21 @@ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask)
> * @pool: pointer to the memory pool
> * @elems: partially or fully populated elements array
> * @count: number of entries in @elem that need to be allocated
> - * @allocated: number of entries in @elem already allocated
> *
> - * Allocate elements for each slot in @elem that is non-%NULL. This is done by
> - * first calling into the alloc_fn supplied at pool initialization time, and
> - * dipping into the reserved pool when alloc_fn fails to allocate an element.
> + * Allocate @count elements into @elems. This is done by first calling into the
> + * alloc_fn supplied at pool initialization time, and dipping into the reserved
> + * pool when alloc_fn fails to allocate an element.
> *
> * On return all @count elements in @elems will be populated.
> *
> * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void.
> */
> int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
> - unsigned int count, unsigned int allocated)
> + unsigned int count)
> {
> gfp_t gfp_mask = GFP_KERNEL;
> gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
> - unsigned int i = 0;
> + unsigned int allocated = 0;
>
> VM_WARN_ON_ONCE(count > pool->min_nr);
> might_alloc(gfp_mask);
> @@ -514,11 +509,9 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
> * Try to allocate the elements using the allocation callback first as
> * that might succeed even when the caller's bulk allocation did not.
> */
> - for (i = 0; i < count; i++) {
> - if (elems[i])
> - continue;
> - elems[i] = pool->alloc(gfp_temp, pool->pool_data);
> - if (unlikely(!elems[i]))
> + while (allocated < count) {
> + elems[allocated] = pool->alloc(gfp_temp, pool->pool_data);
> + if (unlikely(!elems[allocated]))
> goto use_pool;
> allocated++;
> }
^ permalink raw reply
* Re: [PATCH 28/79] block: rnull: add partial I/O support for bad blocks
From: Andreas Hindborg @ 2026-06-03 9:44 UTC (permalink / raw)
To: Alice Ryhl
Cc: Boqun Feng, Jens Axboe, Miguel Ojeda, Gary Guo,
Björn Roy Baron, Benno Lossin, Trevor Gross,
Danilo Krummrich, FUJITA Tomonori, Frederic Weisbecker,
Lyude Paul, Thomas Gleixner, Anna-Maria Behnsen, John Stultz,
Stephen Boyd, Lorenzo Stoakes, Liam R. Howlett, linux-block,
rust-for-linux, linux-kernel, linux-mm
In-Reply-To: <abf0FHWkQzbnOtqD@google.com>
Alice Ryhl <aliceryhl@google.com> writes:
> On Mon, Feb 16, 2026 at 12:35:15AM +0100, Andreas Hindborg wrote:
<cut>
>> +fn is_power_of_two<T>(value: T) -> bool
>> +where
>> + T: core::ops::Sub<T, Output = T>,
>> + T: core::ops::BitAnd<Output = T>,
>> + T: core::cmp::PartialOrd<T>,
>> + T: Copy,
>> + T: From<u8>,
>> +{
>> + (value > 0u8.into()) && (value & (value - 1u8.into())) == 0u8.into()
>> +}
>
> This is in the standard library.
But there is no trait for this, the method is defined for concrete
integer types. I would still need to define a trait and macro implement
it for all integers.
We could do this in `kernel::num`?
Best regards,
Andreas Hindborg
^ permalink raw reply
* Re: [PATCH 24/79] block: rust: add `BadBlocks` for bad block tracking
From: Andreas Hindborg @ 2026-06-03 10:15 UTC (permalink / raw)
To: Alice Ryhl
Cc: Boqun Feng, Jens Axboe, Miguel Ojeda, Gary Guo,
Björn Roy Baron, Benno Lossin, Trevor Gross,
Danilo Krummrich, FUJITA Tomonori, Frederic Weisbecker,
Lyude Paul, Thomas Gleixner, Anna-Maria Behnsen, John Stultz,
Stephen Boyd, Lorenzo Stoakes, Liam R. Howlett, linux-block,
rust-for-linux, linux-kernel, linux-mm
In-Reply-To: <abfvrh0ObKhtsR8I@google.com>
Alice Ryhl <aliceryhl@google.com> writes:
> On Mon, Feb 16, 2026 at 12:35:11AM +0100, Andreas Hindborg wrote:
>> Add a safe Rust wrapper around the Linux kernel's badblocks infrastructure
>> to track and manage defective sectors on block devices. The BadBlocks type
>> provides methods to:
>>
>> - Mark sectors as bad or good (set_bad/set_good)
>> - Check if sector ranges contain bad blocks (check)
>> - Automatically handle memory management with PinnedDrop
>>
>> The implementation includes comprehensive documentation with examples for
>> block device drivers that need to avoid known bad sectors to maintain
>> data integrity. Bad blocks information is used by device drivers,
>> filesystem layers, and device management tools.
>>
>> Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
>
>> diff --git a/rust/kernel/block/badblocks.rs b/rust/kernel/block/badblocks.rs
>> new file mode 100644
>> index 0000000000000..a5fe0fde2e755
>> --- /dev/null
>> +++ b/rust/kernel/block/badblocks.rs
>> @@ -0,0 +1,721 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! Bad blocks tracking for block devices.
>> +//!
>> +//! This module provides a safe Rust wrapper around the badblocks
>> +//! infrastructure, which is used to track and manage bad sectors on block
>> +//! devices. Bad blocks are sectors that cannot reliably store data and should
>> +//! be avoided during I/O operations.
>
> Could use a srctree link to badblocks.h here.
Will add.
>
>> +/// # Examples
>> +///
>> +/// Basic usage:
>> +///
>> +/// ```rust
>> +/// # use kernel::block::badblocks::{BadBlocks, BlockStatus};
>> +/// # use kernel::prelude::*;
>> +///
>
> Unhide the imports or remove this empty line.
Ok.
>
>> +/// // Create a new bad blocks tracker
>> +/// let bad_blocks = KBox::pin_init(BadBlocks::new(true), GFP_KERNEL)?;
>> +///
>> +/// // Mark sectors 100-109 as bad (unacknowledged)
>> +/// bad_blocks.set_bad(100..110, false)?;
>> +///
>> +/// // Check if sector range 95-104 contains bad blocks
>> +/// match bad_blocks.check(95..105) {
>> +/// BlockStatus::None => pr_info!("No bad blocks found"),
>> +/// BlockStatus::Acknowledged(range) => pr_warn!("Acknowledged bad blocks: {:?}", range),
>> +/// BlockStatus::Unacknowledged(range) => pr_err!("Unacknowledged bad blocks: {:?}", range),
>> +/// }
>> +/// # Ok::<(), kernel::error::Error>(())
>> +/// ```
>> +/// # Invariants
>> +///
>> +/// - `self.blocks` is a valid `bindings::badblocks` struct.
>> +#[pin_data(PinnedDrop)]
>> +pub struct BadBlocks {
>> + #[pin]
>> + blocks: Opaque<bindings::badblocks>,
>> +}
>> +
>> +impl BadBlocks {
>> + /// Creates a new bad blocks tracker.
>> + ///
>> + /// Initializes an empty bad blocks tracker that can manage defective sectors
>> + /// on a block device. The tracker starts with no bad blocks recorded and
>> + /// allocates a single page for storing bad block entries.
>> + ///
>> + /// # Returns
>> + ///
>> + /// Returns a [`PinInit`] that can be used to initialize a [`BadBlocks`] instance.
>> + /// Initialization may fail with `ENOMEM` if memory allocation fails.
>> + ///
>> + /// # Examples
>> + ///
>> + /// ```rust
>> + /// # use kernel::block::badblocks::{BadBlocks, BlockStatus};
>> + /// # use kernel::prelude::*;
>> + ///
>
> Ditto. (Many times throughout file.)
>
>> + /// // Create and initialize a bad blocks tracker
>> + /// let bad_blocks = KBox::pin_init(BadBlocks::new(true), GFP_KERNEL)?;
>> + ///
>> + /// // The tracker is ready to use with no bad blocks initially
>> + /// match bad_blocks.check(0..100) {
>> + /// BlockStatus::None => pr_info!("No bad blocks found initially"),
>> + /// _ => unreachable!(),
>> + /// }
>> + /// # Ok::<(), kernel::error::Error>(())
>> + /// ```
>> + pub fn new(enable: bool) -> impl PinInit<Self, Error> {
>> + // INVARIANT: We initialize `self.blocks` below. If initialization fails, an error is
>> + // returned.
>> + try_pin_init!(Self {
>> + blocks <- Opaque::try_ffi_init(|slot| {
>> + // SAFETY: `slot` is a valid pointer to uninitialized memory
>> + // allocated by the Opaque type. `badblocks_init` is safe to
>> + // call with uninitialized memory.
>> + to_result(unsafe {bindings::badblocks_init(slot, if enable {1} else {0})})
>
> I think you can just cast the boolean to an integer.
Ok.
>
> Also, formatting here is off (but ignored by rustfmt due to macro.)
Will try to fix.
>
>> + /// Enables the bad blocks tracker if it was previously disabled.
>> + ///
>> + /// Attempts to enable bad block tracking by transitioning the tracker from
>> + /// a disabled state to an enabled state.
>> + ///
>> + /// # Behavior
>> + ///
>> + /// - If the tracker is disabled, it will be enabled.
>> + /// - If the tracker is already enabled, this operation has no effect.
>> + /// - The operation is atomic and thread-safe.
>> + ///
>> + /// # Usage
>> + ///
>> + /// Bad blocks trackers can be created in a disabled state and enabled later
>> + /// when needed. This is useful for conditional bad block tracking or for
>> + /// deferring activation until the device is fully initialized.
>> + ///
>> + /// # Examples
>> + ///
>> + /// ```rust
>> + /// # use kernel::block::badblocks::BadBlocks;
>> + /// # use kernel::prelude::*;
>> + ///
>> + /// // Create a disabled bad blocks tracker
>> + /// let bad_blocks = KBox::pin_init(BadBlocks::new(false), GFP_KERNEL)?;
>> + /// assert!(!bad_blocks.enabled());
>> + ///
>> + /// // Enable it when needed
>> + /// bad_blocks.enable();
>> + /// assert!(bad_blocks.enabled());
>> + ///
>> + /// // Subsequent enable calls have no effect
>> + /// bad_blocks.enable();
>> + /// assert!(bad_blocks.enabled());
>> + /// # Ok::<(), kernel::error::Error>(())
>> + /// ```
>> + pub fn enable(&self) {
>> + let _ = self.shift_ref().cmpxchg(-1, 0, ordering::Relaxed);
>
> Is there not a C function you can call here? It would be simpler that
> way. Surely drivers don't do this directly.
No, this is the canonical way [1].
[1] https://github.com/torvalds/linux/blob/ba3e43a9e601636f5edb54e259a74f96ca3b8fd8/drivers/block/null_blk/main.c#L563
>
>> + }
>> +
>> + /// Checks whether the bad blocks tracker is currently enabled.
>> + ///
>> + /// Returns `true` if bad block tracking is active, `false` if it is disabled.
>> + /// When disabled, the tracker will not perform bad block checks or operations.
>> + ///
>> + /// # Returns
>> + ///
>> + /// - `true` - Bad block tracking is enabled and operational
>> + /// - `false` - Bad block tracking is disabled
>
> You explain the meaning of return values twice here. Just drop the
> 'Returns' section.
Ok.
>
>> + /// # Thread Safety
>> + ///
>> + /// This method is thread-safe and uses atomic operations to check the
>> + /// tracker's state without requiring external synchronization.
>
> This is implicit from the signature of the function.
Will remove.
>
>> + pub fn set_good(&self, range: impl RangeBounds<u64>) -> Result {
>> + let range = Self::range(range);
>> + // SAFETY: By type invariant `self.blocks` is valid. The C function
>> + // `badblocks_clear` handles synchronization internally.
>> + unsafe {
>> + bindings::badblocks_clear(self.blocks.get(), range.start, range.end - range.start)
>> + }
>> + .then_some(())
>> + .ok_or(EINVAL)
>
> then_some() is quite obscure. I would recommend if/else here.
Ok.
Thanks,
Andreas
^ permalink raw reply
* Re: [PATCH 06/79] block: rust: add `Request` private data support
From: Andreas Hindborg @ 2026-06-03 10:40 UTC (permalink / raw)
To: Alice Ryhl
Cc: Boqun Feng, Jens Axboe, Miguel Ojeda, Gary Guo,
Björn Roy Baron, Benno Lossin, Trevor Gross,
Danilo Krummrich, FUJITA Tomonori, Frederic Weisbecker,
Lyude Paul, Thomas Gleixner, Anna-Maria Behnsen, John Stultz,
Stephen Boyd, Lorenzo Stoakes, Liam R. Howlett, linux-block,
rust-for-linux, linux-kernel, linux-mm
In-Reply-To: <abfXIqlT9qRhXssT@google.com>
Alice Ryhl <aliceryhl@google.com> writes:
> On Mon, Feb 16, 2026 at 12:34:53AM +0100, Andreas Hindborg wrote:
>> C block device drivers can attach private data to a `struct request`. This
>> data is stored next to the request structure and is part of the request
>> allocation set up during driver initialization.
>>
>> Expose this private request data area to Rust block device drivers.
>>
>> Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
>
> Overall looks ok. A few nits below:
>
>> ---
>> drivers/block/rnull/rnull.rs | 5 +++++
>> rust/kernel/block/mq.rs | 6 ++++++
>> rust/kernel/block/mq/operations.rs | 24 +++++++++++++++++++++++-
>> rust/kernel/block/mq/request.rs | 24 +++++++++++++++++++-----
>> rust/kernel/block/mq/tag_set.rs | 2 +-
>> 5 files changed, 54 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
>> index 6a7f660d31998..065639fc4f941 100644
>> --- a/drivers/block/rnull/rnull.rs
>> +++ b/drivers/block/rnull/rnull.rs
>> @@ -134,6 +134,11 @@ struct QueueData {
>> #[vtable]
>> impl Operations for NullBlkDevice {
>> type QueueData = KBox<QueueData>;
>> + type RequestData = ();
>> +
>> + fn new_request_data() -> impl PinInit<Self::RequestData> {
>> + pin_init::zeroed::<Self::RequestData>()
>
> Simpler to just return Ok(()) here.
Not sure why I picked zeroed. Will change.
>
>> + }
>>
>> #[inline(always)]
>> fn queue_rq(queue_data: &QueueData, rq: Owned<mq::Request<Self>>, _is_last: bool) -> Result {
>> diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
>> index b8ecd69abe980..a285b753ada88 100644
>> --- a/rust/kernel/block/mq.rs
>> +++ b/rust/kernel/block/mq.rs
>> @@ -69,8 +69,14 @@
>> //!
>> //! #[vtable]
>> //! impl Operations for MyBlkDevice {
>> +//! type RequestData = ();
>> //! type QueueData = ();
>> //!
>> +//! fn new_request_data(
>> +//! ) -> impl PinInit<()> {
>> +//! pin_init::zeroed::<()>()
>
> Simpler to just return Ok(()) here.
Will change.
>
>> +//! }
>> +//!
>> //! fn queue_rq(_queue_data: (), rq: Owned<Request<Self>>, _is_last: bool) -> Result {
>> //! rq.end_ok();
>> //! Ok(())
>> diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
>> index 3dea79d647ff7..cd37b939bbf30 100644
>> --- a/rust/kernel/block/mq/operations.rs
>> +++ b/rust/kernel/block/mq/operations.rs
>> @@ -13,6 +13,7 @@
>> types::{ForeignOwnable, Owned},
>> };
>> use core::{marker::PhantomData, ptr::NonNull};
>> +use pin_init::PinInit;
>>
>> type ForeignBorrowed<'a, T> = <T as ForeignOwnable>::Borrowed<'a>;
>>
>> @@ -28,10 +29,24 @@
>> /// [module level documentation]: kernel::block::mq
>> #[macros::vtable]
>> pub trait Operations: Sized {
>> + /// Data associated with a request. This data is located next to the request
>> + /// structure.
>> + ///
>> + /// To be able to handle accessing this data from interrupt context, this
>> + /// data must be `Sync`.
>> + ///
>> + /// The `RequestData` object is initialized when the requests are allocated
>> + /// during queue initialization, and it is are dropped when the requests are
>> + /// dropped during queue teardown.
>> + type RequestData: Sized + Sync;
>
> It was surprising to me that `request` is reused over many requests. I
> think it could be a bit more obvious in the wording.
I will add a bit of info in the `Request` type docs.
>
>> /// Data associated with the `struct request_queue` that is allocated for
>> /// the `GenDisk` associated with this `Operations` implementation.
>> type QueueData: ForeignOwnable;
>>
>> + /// Called by the kernel to get an initializer for a `Pin<&mut RequestData>`.
>> + fn new_request_data() -> impl PinInit<Self::RequestData>;
>> +
>> /// Called by the kernel to queue a request with the driver. If `is_last` is
>> /// `false`, the driver is allowed to defer committing the request.
>> fn queue_rq(
>> @@ -236,6 +251,13 @@ impl<T: Operations> OperationsVTable<T> {
>> // it is valid for writes.
>> unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(Refcount::new(0)) };
>>
>> + let initializer = T::new_request_data();
>> +
>> + // SAFETY: `pdu` is a valid pointer as established above. We do not
>> + // touch `pdu` if `__pinned_init` returns an error. We promise ot to
>
> typo
>
>
>> diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
>> index c3cf56d52beec..46481754b1335 100644
>> --- a/rust/kernel/block/mq/tag_set.rs
>> +++ b/rust/kernel/block/mq/tag_set.rs
>> @@ -41,7 +41,7 @@ pub fn new(
>> // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which
>> // all are allowed to be 0.
>> let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() };
>> - let tag_set: Result<_> = core::mem::size_of::<RequestDataWrapper>()
>> + let tag_set: Result<_> = core::mem::size_of::<RequestDataWrapper<T>>()
>
> size_of in prelude.
Fixed.
Thanks,
Andreas
^ permalink raw reply
* Re: [PATCH 09/79] block: rust: introduce `kernel::block::bio` module
From: Andreas Hindborg @ 2026-06-03 11:29 UTC (permalink / raw)
To: Alice Ryhl
Cc: Boqun Feng, Jens Axboe, Miguel Ojeda, Gary Guo,
Björn Roy Baron, Benno Lossin, Trevor Gross,
Danilo Krummrich, FUJITA Tomonori, Frederic Weisbecker,
Lyude Paul, Thomas Gleixner, Anna-Maria Behnsen, John Stultz,
Stephen Boyd, Lorenzo Stoakes, Liam R. Howlett, linux-block,
rust-for-linux, linux-kernel, linux-mm
In-Reply-To: <abfbfV6-FXnVT9Ud@google.com>
Alice Ryhl <aliceryhl@google.com> writes:
> On Mon, Feb 16, 2026 at 12:34:56AM +0100, Andreas Hindborg wrote:
>> Add Rust abstractions for working with `struct bio`, the core IO command
>> descriptor for the block layer.
>>
>> The `Bio` type wraps `struct bio` and provides safe access to the IO
>> vector describing the data buffers associated with the IO command. The
>> data buffers are represented as a vector of `Segment`s, where each
>> segment is a contiguous region of physical memory backed by `Page`.
>>
>> The `BioSegmentIterator` provides iteration over segments in a single
>> bio, while `BioIterator` allows traversing a chain of bios. The
>> `Segment` type offers methods for copying data to and from pages, as
>> well as zeroing page contents, which are the fundamental operations
>> needed by block device drivers to process IO requests.
>>
>> The `Request` type is extended with methods to access the bio chain
>> associated with a request, allowing drivers to iterate over all data
>> buffers that need to be processed.
>>
>> Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
>> ---
>> rust/helpers/blk.c | 8 +
>> rust/kernel/block.rs | 1 +
>> rust/kernel/block/bio.rs | 143 +++++++++++++++
>> rust/kernel/block/bio/vec.rs | 389 ++++++++++++++++++++++++++++++++++++++++
>> rust/kernel/block/mq/request.rs | 46 +++++
>> rust/kernel/lib.rs | 2 +
>> rust/kernel/page.rs | 2 +-
>> 7 files changed, 590 insertions(+), 1 deletion(-)
>>
>> diff --git a/rust/helpers/blk.c b/rust/helpers/blk.c
>> index cc9f4e6a2d234..53beba8c7782d 100644
>> --- a/rust/helpers/blk.c
>> +++ b/rust/helpers/blk.c
>> @@ -1,5 +1,6 @@
>> // SPDX-License-Identifier: GPL-2.0
>>
>> +#include <linux/bio.h>
>> #include <linux/blk-mq.h>
>> #include <linux/blkdev.h>
>>
>> @@ -12,3 +13,10 @@ struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu)
>> {
>> return blk_mq_rq_from_pdu(pdu);
>> }
>> +
>> +void rust_helper_bio_advance_iter_single(const struct bio *bio,
>> + struct bvec_iter *iter,
>> + unsigned int bytes)
>> +{
>> + bio_advance_iter_single(bio, iter, bytes);
>> +}
>
> __rust_helper
Thanks.
>
>> diff --git a/rust/kernel/block/bio.rs b/rust/kernel/block/bio.rs
>> new file mode 100644
>> index 0000000000000..94062ea5281e6
>> --- /dev/null
>> +++ b/rust/kernel/block/bio.rs
>> @@ -0,0 +1,143 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! Types for working with the bio layer.
>> +//!
>> +//! C header: [`include/linux/blk_types.h`](../../include/linux/blk_types.h)
>
> srctree/
Ok.
>
>> +/// A block device IO descriptor (`struct bio`).
>> +///
>> +/// A `Bio` is the main unit of IO for the block layer. It describes an IO command and associated
>> +/// data buffers.
>> +///
>> +/// The data buffers associated with a `Bio` are represented by a vector of [`Segment`]s. These
>> +/// segments represent physically contiguous regions of memory. The memory is represented by
>> +/// [`Page`] descriptors internally.
>> +///
>> +/// The vector of [`Segment`]s can be iterated by obtaining a [`SegmentIterator`].
>> +///
>> +/// # Invariants
>> +///
>> +/// Instances of this type is always reference counted. A call to
>> +/// `bindings::bio_get()` ensures that the instance is valid for read at least
>> +/// until a matching call to `bindings :bio_put()`.
>
> Refcounted? None of these methods are called anywhere, and you do not
> implement AlwaysRefcounted.
This is stale info, I will remove it.
>
>> +#[repr(transparent)]
>> +pub struct Bio(Opaque<bindings::bio>);
>> +
>> +impl Bio {
>> + /// Returns an iterator over segments in this `Bio`. Does not consider
>> + /// segments of other bios in this bio chain.
>> + #[inline(always)]
>> + pub fn segment_iter(&mut self) -> BioSegmentIterator<'_> {
>
> Not `self: Pin<&mut Self>` here?
It definitely must be pinned, thanks.
>
>> + /// Create an instance of `Bio` from a raw pointer.
>> + ///
>> + /// # Safety
>> + ///
>> + /// Caller must ensure that the `ptr` is valid for use as a reference to
>> + /// `Bio` for the duration of `'a`.
>> + #[inline(always)]
>> + pub(crate) unsafe fn from_raw<'a>(ptr: *mut bindings::bio) -> Option<&'a Self> {
>> + Some(
>> + // SAFETY: by the safety requirement of this funciton, `ptr` is
>> + // valid for read for the duration of the returned lifetime
>> + unsafe { &*NonNull::new(ptr)?.as_ptr().cast::<Bio>() },
>> + )
>> + }
>> +
>> + /// Create an instance of `Bio` from a raw pointer.
>> + ///
>> + /// # Safety
>> + ///
>> + /// Caller must ensure that the `ptr` is valid for use as a unique reference
>> + /// to `Bio` for the duration of `'a`.
>> + #[inline(always)]
>> + pub(crate) unsafe fn from_raw_mut<'a>(ptr: *mut bindings::bio) -> Option<&'a mut Self> {
>> + Some(
>> + // SAFETY: by the safety requirement of this funciton, `ptr` is
>> + // valid for read for the duration of the returned lifetime
>> + unsafe { &mut *NonNull::new(ptr)?.as_ptr().cast::<Bio>() },
>
> Why the Option? I imagine every caller has a non-null pointert
It felt more streamlined to have the check here than at the call site:
/// Get a mutable reference to the first [`Bio`] in this request.
#[inline(always)]
pub fn bio_mut(&mut self) -> Option<&mut Bio> {
// SAFETY: By type invariant of `Self`, `self.0` is valid and the deref
// is safe.
let ptr = unsafe { (*self.0 .0.get()).bio };
// SAFETY: By C API contract, if `bio` is not null it will have a
// positive refcount at least for the duration of the lifetime of
// `&self`.
unsafe { Bio::from_raw_mut(ptr) }
}
Yes, this also needs to take Pin<&mut Self>.
>
>> + )
>> + }
>> +}
>> +
>> +impl core::fmt::Display for Bio {
>
> We have our own fmt trait now, right?
Will switch to the kernel one.
>
>> + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
>> + write!(
>> + f,
>> + "Bio({:?}, vcnt: {}, idx: {}, size: 0x{:x}, completed: 0x{:x})",
>> + self.0.get(),
>> + self.io_vec_count(),
>> + self.raw_iter().bi_idx,
>> + self.raw_iter().bi_size,
>> + self.raw_iter().bi_bvec_done
>
> This reads the entire `bi_iter` field three separate times. A local
> variable may be a good idea.
Ok.
>
>> +/// An iterator over `Segment`
>> +///
>> +/// # Invariants
>> +///
>> +/// If `iter.bi_size` > 0, `iter` must always index a valid `bio_vec` in `bio.io_vec()`.
>> +pub struct BioSegmentIterator<'a> {
>> + bio: &'a mut Bio,
>> + iter: bindings::bvec_iter,
>> +}
>> +
>> +impl<'a> BioSegmentIterator<'a> {
>> + /// Creeate a new segemnt iterator for iterating the segments of `bio`. The
>
> typo
Thanks.
>
>> +impl<'a> core::iter::Iterator for BioSegmentIterator<'a> {
>> + type Item = Segment<'a>;
>> +
>> + #[inline(always)]
>> + fn next(&mut self) -> Option<Self::Item> {
>> + if self.iter.bi_size == 0 {
>> + return None;
>> + }
>> +
>> + // SAFETY: We checked that `self.iter.bi_size` > 0 above.
>> + let bio_vec_ret = unsafe { self.io_vec() };
>> +
>> + // SAFETY: By existence of reference `&bio`, `bio.0` contains a valid
>> + // `struct bio`. By type invariant of `BioSegmentItarator` `self.iter`
>> + // indexes into a valid `bio_vec` entry. By C API contracit, `bv_len`
>> + // does not exceed the size of the bio.
>> + unsafe {
>> + bindings::bio_advance_iter_single(
>> + self.bio.0.get(),
>> + core::ptr::from_mut(&mut self.iter),
>
> Creating this BioSegmentItarator copies the bvec_iter from the Bio, and
> then here you modify the copy. Is that the intent? Is the C type such
> that copying it is always okay?
Yes. I can see if I can document this better.
> Also, is the C type such that moves are ok? It's playsible that the
> answer is yes - the same applies in rust/kernel/iov.rs but it could be
> clearer in e.g. "Invariants" that this is the case.
Yes, it is so. I'll update docs.
> Nit: core::ptr::from_mut(&mut self.iter) -> &raw mut self.iter
Ok.
>
>> + /// Get a mutable reference to the first [`Bio`] in this request.
>> + #[inline(always)]
>> + pub fn bio_mut(&mut self) -> Option<&mut Bio> {
>> + // SAFETY: By type invariant of `Self`, `self.0` is valid and the deref
>> + // is safe.
>> + let ptr = unsafe { (*self.0.get()).bio };
>> + // SAFETY: By C API contract, if `bio` is not null it will have a
>> + // positive refcount at least for the duration of the lifetime of
>> + // `&self`.
>> + unsafe { Bio::from_raw_mut(ptr) }
>
> Surely &mut requires refcount == 1, not just positive refcount?
No, that is not how the C refcount works. Upper layers of the IO stack
will hold refcounts on the bio, even though lower layers are allowed to
mutate the data buffers of the bio.
Best regards,
Andreas Hindborg
^ permalink raw reply
* Re: [PATCH 10/79] block: rust: add `command` getter to `Request`
From: Andreas Hindborg @ 2026-06-03 11:50 UTC (permalink / raw)
To: Alice Ryhl
Cc: Boqun Feng, Jens Axboe, Miguel Ojeda, Gary Guo,
Björn Roy Baron, Benno Lossin, Trevor Gross,
Danilo Krummrich, FUJITA Tomonori, Frederic Weisbecker,
Lyude Paul, Thomas Gleixner, Anna-Maria Behnsen, John Stultz,
Stephen Boyd, Lorenzo Stoakes, Liam R. Howlett, linux-block,
rust-for-linux, linux-kernel, linux-mm
In-Reply-To: <abfbuJVbPtIEyweC@google.com>
Alice Ryhl <aliceryhl@google.com> writes:
> On Mon, Feb 16, 2026 at 12:34:57AM +0100, Andreas Hindborg wrote:
>> Add a method to extract the command operation code from a request. The
>> command is obtained by masking the lower bits of `cmd_flags` as defined by
>> `REQ_OP_BITS`. This allows Rust block drivers to determine the type of
>> operation being requested.
>>
>> Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
>
> With the nit below fixed:
> Reviewed-by: Alice Ryhl <aliceryhl@google.com>
>
>> rust/kernel/block/mq/request.rs | 6 ++++++
>> 1 file changed, 6 insertions(+)
>>
>> diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
>> index b49197a0c66d7..0dd329ae93dfc 100644
>> --- a/rust/kernel/block/mq/request.rs
>> +++ b/rust/kernel/block/mq/request.rs
>> @@ -78,6 +78,12 @@ pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef<Self> {
>> unsafe { ARef::from_raw(NonNull::new_unchecked(ptr.cast())) }
>> }
>>
>> + /// Get the command identifier for the request
>> + pub fn command(&self) -> u32 {
>> + // SAFETY: By C API contract and type invariant, `cmd_flags` is valid for read
>> + unsafe { (*self.0.get()).cmd_flags & ((1 << bindings::REQ_OP_BITS) - 1) }
>
> Nit: scope of unsafe
>
> unsafe { (*self.0.get()).cmd_flags } & ((1 << bindings::REQ_OP_BITS) - 1)
The `&` is parsed as reference operator with this change. But we can do
this:
use core::ops::BitAnd;
// SAFETY: By C API contract and type invariant, `cmd_flags` is valid for read
unsafe { (*self.0.get()).cmd_flags }.bitand((1u32 << bindings::REQ_OP_BITS) - 1)
Best regards,
Andreas Hindborg
^ permalink raw reply
* [PATCH v3] block: assign caller-specific lockdep class to disk->open_mutex
From: Tetsuo Handa @ 2026-06-03 11:54 UTC (permalink / raw)
To: Christoph Hellwig, Bart Van Assche, Jens Axboe
Cc: linux-block, LKML, Andrew Morton, Ming Lei, Damien Le Moal,
Qu Wenruo, Hillf Danton, Miguel Ojeda
In-Reply-To: <420f723a-8168-4f56-b84a-2a36ecd87fea@I-love.SAKURA.ne.jp>
The block core currently allocates a single monolithic lockdep key for
disk->open_mutex across all callers. This single key conflates locking
hierarchies between independent block streams. For example, if a stacked
driver like loop flushes its internal workqueues inside lo_release() while
holding its own open_mutex, lockdep views this as a potential ABBA deadlock
against the underlying storage stack, leading to numerous circular
dependency splats.
To structurally reduce false positives, this patch splits the global
monolithic lock class into distinct, per-caller instances during disk
allocation. This is done by replacing "struct lock_class_key" with
"struct gendisk_lkclass", which contains two instances of
"struct lock_class_key" for the legacy "(bio completion)" map and
disk->open_mutex respectively.
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
Changes in v3:
- Adjusted Rust part for safe pointer passing, pointed out by sashiko
( https://sashiko.dev/#/patchset/420f723a-8168-4f56-b84a-2a36ecd87fea%40I-love.SAKURA.ne.jp ) .
Changes in v2:
- Replaced a two-element array with a struct with two named members, suggested by Bart Van Assche
( https://lore.kernel.org/all/4cf7ecc7-932c-4589-9d0f-3e025e83e27c@acm.org/ ).
- Added changes needed by Rust block layer bindings and rnull module, pointed out by sashiko
( https://sashiko.dev/#/patchset/147ed056-03d9-4214-b925-0f10fc00cf27%40I-love.SAKURA.ne.jp ).
Testing result of v1:
- I kept v1 patch in linux-next for several more days, but result was that
some of circular dependency splats which I thought this change succeeded to
eliminate are still getting reported. That is, we need to determine whether
we should make this change without example syzbot reports that demonstrates
difference. But in general, keeping locking chains simpler and shorter
should be a good change.
Acknowledgment:
Since I have no experience with Rust, changes needed by Rust block layer
bindings and rnull module are made based on conversation with the Gemini
AI collaborator.
block/blk-mq.c | 4 ++--
block/blk.h | 2 +-
block/genhd.c | 8 +++----
drivers/block/rnull/rnull.rs | 2 +-
drivers/scsi/sd.c | 2 +-
drivers/scsi/sr.c | 2 +-
include/linux/blk-mq.h | 6 ++---
include/linux/blkdev.h | 9 +++++--
rust/kernel/block/mq.rs | 2 +-
rust/kernel/block/mq/gen_disk.rs | 41 ++++++++++++++++++++++++++++++--
10 files changed, 60 insertions(+), 18 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a24175441380..5203e8cc6a28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4492,7 +4492,7 @@ EXPORT_SYMBOL(blk_mq_destroy_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
@@ -4513,7 +4513,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
EXPORT_SYMBOL(__blk_mq_alloc_disk);
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct gendisk *disk;
diff --git a/block/blk.h b/block/blk.h
index b998a7761faf..611bcd655357 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -614,7 +614,7 @@ void drop_partition(struct block_device *part);
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e9..8f4a3d8ca15e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1444,7 +1444,7 @@ dev_t part_devt(struct gendisk *disk, u8 partno)
}
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct gendisk *disk;
@@ -1467,7 +1467,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
goto out_free_bdi;
disk->node_id = node_id;
- mutex_init(&disk->open_mutex);
+ mutex_init_with_key(&disk->open_mutex, &lkclass->open_mutex_lkclass);
xa_init(&disk->part_tbl);
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
@@ -1482,7 +1482,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
device_initialize(disk_to_dev(disk));
inc_diskseq(disk);
q->disk = disk;
- lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
+ lockdep_init_map(&disk->lockdep_map, "(bio completion)", &lkclass->bio_lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
@@ -1506,7 +1506,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
}
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
- struct lock_class_key *lkclass)
+ struct gendisk_lkclass *lkclass)
{
struct queue_limits default_lim = { };
struct request_queue *q;
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 0ca8715febe8..476a8910c432 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -61,7 +61,7 @@ fn new(
.logical_block_size(block_size)?
.physical_block_size(block_size)?
.rotational(rotational)
- .build(fmt!("{}", name.to_str()?), tagset, queue_data)
+ .build(fmt!("{}", name.to_str()?), tagset, queue_data, kernel::my_gendisk_lkclass!())
}
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 599e75f33334..63fe8c86606a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -112,7 +112,7 @@ static DEFINE_MUTEX(sd_mutex_lock);
static mempool_t *sd_page_pool;
static mempool_t *sd_large_page_pool;
static atomic_t sd_large_page_pool_users = ATOMIC_INIT(0);
-static struct lock_class_key sd_bio_compl_lkclass;
+static struct gendisk_lkclass sd_bio_compl_lkclass;
static const char *sd_cache_types[] = {
"write through", "none", "write back",
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index c36c54ecd354..734567ae0e43 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -106,7 +106,7 @@ static struct scsi_driver sr_template = {
static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG];
static DEFINE_SPINLOCK(sr_index_lock);
-static struct lock_class_key sr_bio_compl_lkclass;
+static struct gendisk_lkclass sr_bio_compl_lkclass;
static int sr_open(struct cdrom_device_info *, int);
static void sr_release(struct cdrom_device_info *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..5aa17e82c3ba 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -726,15 +726,15 @@ enum {
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata) \
({ \
- static struct lock_class_key __key; \
+ static struct gendisk_lkclass __key; \
\
__blk_mq_alloc_disk(set, lim, queuedata, &__key); \
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..28b0aee6b3ba 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -49,6 +49,11 @@ extern const struct device_type disk_type;
extern const struct device_type part_type;
extern const struct class block_class;
+struct gendisk_lkclass {
+ struct lock_class_key bio_lkclass;
+ struct lock_class_key open_mutex_lkclass;
+};
+
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
@@ -974,7 +979,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate);
void put_disk(struct gendisk *disk);
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
- struct lock_class_key *lkclass);
+ struct gendisk_lkclass *lkclass);
/**
* blk_alloc_disk - allocate a gendisk structure
@@ -990,7 +995,7 @@ struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
*/
#define blk_alloc_disk(lim, node_id) \
({ \
- static struct lock_class_key __key; \
+ static struct gendisk_lkclass __key; \
\
__blk_alloc_disk(lim, node_id, &__key); \
})
diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
index 1fd0d54dd549..10f22b200567 100644
--- a/rust/kernel/block/mq.rs
+++ b/rust/kernel/block/mq.rs
@@ -88,7 +88,7 @@
//! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
//! let mut disk = gen_disk::GenDiskBuilder::new()
//! .capacity_sectors(4096)
-//! .build(fmt!("myblk"), tagset, ())?;
+//! .build(fmt!("myblk"), tagset, (), kernel::my_gendisk_lkclass!())?;
//!
//! # Ok::<(), kernel::error::Error>(())
//! ```
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
index 912cb805caf5..7e669ca5c032 100644
--- a/rust/kernel/block/mq/gen_disk.rs
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -11,7 +11,6 @@
error::{self, from_err_ptr, Result},
fmt::{self, Write},
prelude::*,
- static_lock_class,
str::NullTerminatedFormatter,
sync::Arc,
types::{ForeignOwnable, ScopeGuard},
@@ -38,6 +37,43 @@ fn default() -> Self {
}
}
+/// A wrapper type for safely passing "struct gendisk_lkclass" argument.
+///
+/// This type can only be instantiated via the [`my_gendisk_lkclass!`] macro.
+pub struct GenDiskLockClass(pub(crate) *mut bindings::gendisk_lkclass);
+
+impl GenDiskLockClass {
+ /// Retrieve the underlying raw pointer.
+ pub(crate) fn as_ptr(&self) -> *mut bindings::gendisk_lkclass {
+ self.0
+ }
+}
+
+#[doc(hidden)]
+pub mod __internal {
+ use super::*;
+
+ /// Internal constructor used ONLY by the `my_gendisk_lkclass!` macro.
+ ///
+ /// SAFETY: `ptr` must point to a valid static `gendisk_lkclass` instance.
+ pub const unsafe fn new_lock_class(ptr: *mut bindings::gendisk_lkclass) -> GenDiskLockClass {
+ GenDiskLockClass(ptr)
+ }
+}
+
+/// Helper macro to generate a unique caller-local static lock class struct
+#[macro_export]
+macro_rules! my_gendisk_lkclass {
+ () => {{
+ static mut LKCLASS: $crate::bindings::gendisk_lkclass = $crate::bindings::gendisk_lkclass {
+ bio_lkclass: const { unsafe { ::core::mem::zeroed() } },
+ open_mutex_lkclass: const { unsafe { ::core::mem::zeroed() } },
+ };
+
+ unsafe { $crate::block::mq::gen_disk::__internal::new_lock_class(&raw mut LKCLASS) }
+ }};
+}
+
impl GenDiskBuilder {
/// Create a new instance.
pub fn new() -> Self {
@@ -100,6 +136,7 @@ pub fn build<T: Operations>(
name: fmt::Arguments<'_>,
tagset: Arc<TagSet<T>>,
queue_data: T::QueueData,
+ lkclass: GenDiskLockClass,
) -> Result<GenDisk<T>> {
let data = queue_data.into_foreign();
let recover_data = ScopeGuard::new(|| {
@@ -121,7 +158,7 @@ pub fn build<T: Operations>(
tagset.raw_tag_set(),
&mut lim,
data,
- static_lock_class!().as_ptr(),
+ lkclass.as_ptr(),
)
})?;
--
2.54.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox