* [PATCH] block: reject block device inodes with i_rdev == 0 in lookup_bdev()
From: Yun Zhou @ 2026-07-03 6:55 UTC (permalink / raw)
To: axboe, brauner; +Cc: linux-block, linux-kernel, yun.zhou
lookup_bdev() blindly returns inode->i_rdev without validating it.
When a FUSE filesystem exposes a root inode with S_IFBLK mode but
i_rdev == 0 (via rootmode=060000), any subsequent mount attempt using
that path as a block device source propagates dev_t 0 into the
superblock machinery. After commit 9ee5f161a4db ("fs: maintain a
global device-to-superblock table") this triggers a WARNING in
super_dev_register().
Reject i_rdev == 0 early with -ENODEV since no real block device
driver registers major 0.
Reported-by: syzbot+72fe3ea5814121fbc76e@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=72fe3ea5814121fbc76e
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
block/bdev.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/block/bdev.c b/block/bdev.c
index 28b0d40c362f..797d7f0ef609 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1278,6 +1278,18 @@ int lookup_bdev(const char *pathname, dev_t *dev)
if (!may_open_dev(&path))
goto out_path_put;
+ /*
+ * Reject a block device inode with i_rdev == 0. A dev_t of 0 is
+ * never valid for a block device: no real block device driver
+ * registers major 0. Fake block device inodes (e.g. fuse with
+ * rootmode=S_IFBLK) can expose i_rdev == 0, and letting that
+ * propagate would confuse superblock lookup and trigger warnings
+ * in the device-to-superblock table (super_dev_register).
+ */
+ error = -ENODEV;
+ if (!inode->i_rdev)
+ goto out_path_put;
+
*dev = inode->i_rdev;
error = 0;
out_path_put:
--
2.43.0
^ permalink raw reply related
* [syzbot] [block?] INFO: task hung in read_cache_folio (6)
From: syzbot @ 2026-07-03 7:25 UTC (permalink / raw)
To: axboe, linux-block, linux-kernel, syzkaller-bugs
Hello,
syzbot found the following issue on:
HEAD commit: 92e3f6ef4ffb Merge branch 'for-next/core' into for-kernelci
git tree: git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-kernelci
console output: https://syzkaller.appspot.com/x/log.txt?x=142300d6580000
kernel config: https://syzkaller.appspot.com/x/.config?x=a4a42e870a0b0ae0
dashboard link: https://syzkaller.appspot.com/bug?extid=9db0864859224b833108
compiler: Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6
userspace arch: arm64
Unfortunately, I don't have any reproducer for this issue yet.
Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/75ce25b4a6ef/disk-92e3f6ef.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/14bda28d7d38/vmlinux-92e3f6ef.xz
kernel image: https://storage.googleapis.com/syzbot-assets/247283a18992/Image-92e3f6ef.gz.xz
IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+9db0864859224b833108@syzkaller.appspotmail.com
INFO: task udevd:6979 blocked
INFO: task udevd:6979 blocked in I/O wait for more than 143 seconds.
Tainted: G L syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:udevd state:D stack:0 pid:6979 tgid:6979 ppid:4339 task_flags:0x400140 flags:0x00800011
Call trace:
__switch_to+0x2b0/0x6e0 arch/arm64/kernel/process.c:810 (T)
context_switch kernel/sched/core.c:5388 [inline]
__schedule+0x1c04/0x2db8 kernel/sched/core.c:7189
__schedule_loop kernel/sched/core.c:7268 [inline]
schedule+0x13c/0x20c kernel/sched/core.c:7283
io_schedule+0x1c/0xe0 kernel/sched/core.c:8110
folio_wait_bit_common+0x6b8/0xa30 mm/filemap.c:1324
folio_put_wait_locked mm/filemap.c:1493 [inline]
do_read_cache_folio+0x23c/0x5a8 mm/filemap.c:4089
read_cache_folio+0x68/0x88 mm/filemap.c:4139
read_mapping_folio include/linux/pagemap.h:1017 [inline]
read_part_sector+0xcc/0x708 block/partitions/core.c:724
adfspart_check_POWERTEC+0x94/0x5b8 block/partitions/acorn.c:450
check_partition block/partitions/core.c:143 [inline]
blk_add_partitions block/partitions/core.c:591 [inline]
bdev_disk_changed+0x6c4/0x11ec block/partitions/core.c:695
blkdev_get_whole+0x15c/0x240 block/bdev.c:756
bdev_open+0x2b4/0x880 block/bdev.c:965
blkdev_open+0x2d4/0x408 block/fops.c:697
do_dentry_open+0x5c4/0xfb8 fs/open.c:947
vfs_open+0x44/0x2dc fs/open.c:1079
do_open fs/namei.c:4699 [inline]
path_openat+0x22d4/0x2b88 fs/namei.c:4858
do_file_open+0x1c8/0x2e8 fs/namei.c:4887
do_sys_openat2+0x114/0x1e8 fs/open.c:1364
do_sys_open+0xac/0xdc fs/open.c:1370
__do_sys_openat fs/open.c:1386 [inline]
__se_sys_openat fs/open.c:1381 [inline]
__arm64_sys_openat+0xa0/0xbc fs/open.c:1381
__invoke_syscall arch/arm64/kernel/syscall.c:35 [inline]
invoke_syscall+0x98/0x244 arch/arm64/kernel/syscall.c:49
el0_svc_common+0xec/0x23c arch/arm64/kernel/syscall.c:121
do_el0_svc+0x4c/0x5c arch/arm64/kernel/syscall.c:140
el0_svc+0x64/0x260 arch/arm64/kernel/entry-common.c:736
el0t_64_sync_handler+0x48/0x148 arch/arm64/kernel/entry-common.c:755
el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:594
Showing all locks held in the system:
1 lock held by khungtaskd/31:
#0: ffff800088bf70c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline]
#0: ffff800088bf70c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
#0: ffff800088bf70c0 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x4c/0x188 kernel/locking/lockdep.c:6775
2 locks held by pr/ttyAMA-1/41:
2 locks held by getty/4488:
#0: ffff0000d43c90a0 (&tty->ldisc_sem){++++}-{0:0}, at: ldsem_down_read+0x3c/0x4c drivers/tty/tty_ldsem.c:340
#1: ffff8000923ab2e8 (&ldata->atomic_read_lock){+.+.}-{4:4}, at: n_tty_read+0x348/0xf70 drivers/tty/n_tty.c:2211
1 lock held by udevd/6979:
#0: ffff0000c7f2a350 (&disk->open_mutex){+.+.}-{4:4}, at: bdev_open+0xc4/0x880 block/bdev.c:953
=============================================
---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title
If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)
If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report
If you want to undo deduplication, reply with:
#syz undup
^ permalink raw reply
* Re: [PATCH v2 17/18] iomap: pass iomap_next_fn directly instead of struct iomap_ops
From: Christian Brauner @ 2026-07-03 10:37 UTC (permalink / raw)
To: Joanne Koong
Cc: Darrick J. Wong, Christoph Hellwig, brauner, willy, hsiangkao,
linux-fsdevel, linux-xfs, Jens Axboe, Chris Mason, David Sterba,
Alexander Viro, Jan Kara, Dan Williams, Gao Xiang, Chao Yu,
Yue Hu, Jeffle Xu, Sandeep Dhavale, Hongbo Li, Chunhai Guo,
Namjae Jeon, Sungjong Seo, Yuezhang Mo, Theodore Ts'o,
Andreas Dilger, Baokun Li, Ojaswin Mujoo, Ritesh Harjani (IBM),
Zhang Yi, Jaegeuk Kim, Miklos Szeredi, Andreas Gruenbacher,
Mikulas Patocka, Hyunchul Lee, Konstantin Komarov,
Carlos Maiolino, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
open list:BLOCK LAYER, open list, open list:BTRFS FILE SYSTEM,
open list:FILESYSTEM DIRECT ACCESS (DAX),
open list:EROFS FILE SYSTEM, open list:EXT2 FILE SYSTEM,
open list:F2FS FILE SYSTEM, open list:FUSE FILESYSTEM [CORE],
open list:GFS2 FILE SYSTEM, open list:NTFS3 FILESYSTEM
In-Reply-To: <CAJnrk1b8j5WHtbHOWNXc4=QBFOxde1f2QxTOeui7Ta8O-xWcTA@mail.gmail.com>
On 2026-07-02 18:47 -0700, Joanne Koong wrote:
> On Thu, Jul 2, 2026 at 9:51 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Thu, Jul 02, 2026 at 04:07:05PM +0200, Christoph Hellwig wrote:
> > > Looks good:
> > >
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > >
> > > In terms of merge logistics, I wonder if we should delay this and
> > > the previous patch to the next merge window so that we can minimize the
> > > cross-subsystem merge pain with more file system iomap conversion.
> > > If none of them actually happen until rc6 or so, or if the merges aren't
> > > painful we could still pick them up late in the merge window.
> >
> > I'd say everything but this patch should go in during the merge window
> > for 7.3, along with clear instructions to brauner/torvalds to expect
> > this patch to appear right before 7.3-rc1 gets tagged, to clean up all
> > the other changes that come in.
>
> Just to clarify, did you mean this patch and the previous one? If I'm
> interpreting Christoph's concern correctly, I think he's worried about
> other filesystems converting to iomap using the ->iomap_begin() /
> ->iomap_end() functions still? That sounds like a good plan to me, for
> v3 I'll submit everything but this patch and the last one and then
Ok, so we'll do the prep for vfs-7.3.iomap (aka to be merged in the
v7.3-rc1 cycle)...
> submit these patches (and any cleanup ones that become necessary) to
> Christian right before 7.3-rc1 gets tagged (which as I understand it,
> is when the merge window is about to close).
and merge these _after_ v7.3-rc1 has been tagged...
^ permalink raw reply
* [PATCH v8 0/9] Support for block device NVMEM providers
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski, Krzysztof Kozlowski,
Piotr Kwapulinski, Konrad Dybcio
On embedded devices, it is common for factory provisioning to store
device-specific information, such as Ethernet or WiFi MAC addresses,
in a dedicated area of an eMMC partition. This avoids the need for
an additional EEPROM/OTP and leverages the persistence of eMMC.
One example is the Arduino UNO-Q, where the WiFi MAC address and the
Bluetooth Device address are stored in the eMMC Boot1 partition.
Until now, accessing this information required a custom bootloader
to read the data and inject it into the Device Tree before handing
control over to the kernel. This approach is fragile and leads to
device-specific workarounds.
Rather than adding a new NVMEM provider specifically to the eMMC
subsystem, the new support operates at the block layer, allowing any
block device to behave like other non-volatile memories such as EEPROM
or OTP.
This series builds on earlier work by Daniel Golle that enables block
devices to act as NVMEM providers:
https://lore.kernel.org/all/6061aa4201030b9bb2f8d03ef32a564fdb786ed1.1709667858.git.daniel@makrotopia.org/
It also introduces an NVMEM layout description for the Arduino UNO-Q,
allowing device-specific data stored in the eMMC Boot1 partition to
be accessed in a standard way.
WiFi and Ethernet already support retrieving MAC addresses from NVMEM.
Bluetooth requires similar support, which is also addressed.
Note that this is currently limited to MMC-backed block devices, as
only the MMC core associates a firmware node with the block device
(add_disk_fwnode). This can be easily extended in the future to
support additional block drivers.
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
Changes in v8:
- block: Fix bdev->bd_nvmem on registering failure (sashiko)
- net: of_net: of_get_nvmem_eui48() now also rejects all-ones cells (sashiko)
- Bluetooth: mgmt: account for HCI_QUIRK_USE_BDADDR_NVMEM in
is_configured() and get_missing_options() (sashiko)
- Reword blk-nvmem KCONFIG help text to satisfy checkpatch
- Link to v7: https://lore.kernel.org/r/20260701-block-as-nvmem-v7-0-3fe8205ef0a8@oss.qualcomm.com
Changes in v7:
- Rework bindings/dts so that the eMMC boot partition can be a nvmem fixed-layout
and not a child of fixed-partition. (Rob)
- Add Support for fixed-layout as the nvmem device node itself
- Remove "block: partitions: of: Skip child nodes without reg property"
This is no longer required in this series and will be submitted separately (Rob)
- Add missing linux/cleanup.h and linux/device.h includes (Bartosz)
- simplify nvmem_register() error path using dev_err_probe() (Bartosz)
- nvmem_device forward declaration to blk_types.h (Bartosz)
- Add hci_dev_get_bd_addr_from_nvmem() kernel-doc for return value (Piotr)
- Link to v6: https://lore.kernel.org/r/20260629-block-as-nvmem-v6-0-f02513dcd46d@oss.qualcomm.com
Changes in v6:
- blk_nvmem_add() returns int, error properly propagated (Bartosz)
- Redundant if (bdev->bd_nvmem) guard removed in blk_nvmem_del() (Bartosz)
- Size guard changed from UINT_MAX → INT_MAX to avoid signed overflow in config.size (sashiko)
- BLK_OPEN_RESTRICT_WRITES removed from blk_nvmem_reg_read() (sashiko)
- Link to v5: https://lore.kernel.org/r/20260612-block-as-nvmem-v5-0-95e0b30fff90@oss.qualcomm.com
Changes in v5:
- Fixed ath10k binding issue + extended commit message (Krzysztof)
- Moved blk-nvmem handling to block core instead of a class_interface
This allows correct/robust integration with block device life cycle (Bartosz).
- block: partitions: of: Skip child nodes without reg property (sashiko)
- Link to v4: https://lore.kernel.org/r/20260609-block-as-nvmem-v4-0-45712e6b22c6@oss.qualcomm.com
Changes in v4:
- Fix squash issue (dts commit incorrectly squashed) (Konrad)
- Use devres for nvmem resources (Bartosz)
- use __free() destructor helper when possible (Bartosz)
- Fix value return checking for bdev_file_open_by_dev
- Link to v3: https://lore.kernel.org/r/20260608-block-as-nvmem-v3-0-82681f50aa35@oss.qualcomm.com
Changes in v3:
- Fixed missing 'fixed-partitions' compatible in partition (Rob)
- Fixed clashing nvmem cells, document calibration along mac (Sashiko)
- Remove workaround to handle dangling nvmem references after
unregistering, this is a generic nvmem framework issue handled
in Bartosz's series:
https://lore.kernel.org/all/20260429-nvmem-unbind-v3-0-2a694f95395b@oss.qualcomm.com/
- Validate mac (is_valid_ether_addr) before copying to output buffer
- Link to v2: https://lore.kernel.org/r/20260507-block-as-nvmem-v2-0-bf17edd5134e@oss.qualcomm.com
Changes in v2:
- Fix example nvmem-layout cells to use compatible = "mac-base"
- Squash WiFi MAC and Bluetooth BD address consumer patches into the nvmem layout patch
- Fix possible use-after-free in blk-nvmem: bnv (nvmem priv) linked to nvmem lifetime
- Simplify nvmem-cell-names from items: - const: to plain const:
- Factor out common NVMEM EUI-48 retrieval logic
- Reorder changes
- Link to v1: https://lore.kernel.org/r/20260428-block-as-nvmem-v1-0-6ad23e75190a@oss.qualcomm.com
---
Daniel Golle (1):
block: implement NVMEM provider
Loic Poulain (8):
dt-bindings: mmc: Document fixed-layout NVMEM provider support
dt-bindings: net: wireless: qcom,ath10k: Document NVMEM cells
dt-bindings: bluetooth: qcom: Add NVMEM BD address cell
nvmem: layouts: Support fixed-layout as the nvmem device node itself
net: of_net: Add of_get_nvmem_eui48() helper for EUI-48 lookup
Bluetooth: hci_sync: Add NVMEM-backed BD address retrieval
Bluetooth: qca: Set NVMEM BD address quirks when address is invalid
arm64: dts: qcom: arduino-imola: Describe NVMEM layout for WiFi/BT addresses
.../devicetree/bindings/mmc/mmc-card.yaml | 23 ++++-
.../net/bluetooth/qcom,bluetooth-common.yaml | 9 ++
.../bindings/net/wireless/qcom,ath10k.yaml | 16 +++
arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts | 32 ++++++
block/Kconfig | 11 +++
block/Makefile | 1 +
block/blk-nvmem.c | 110 +++++++++++++++++++++
block/blk.h | 8 ++
block/genhd.c | 4 +
drivers/bluetooth/btqca.c | 5 +-
drivers/nvmem/layouts.c | 13 ++-
include/linux/blk_types.h | 4 +
include/linux/of_net.h | 7 ++
include/net/bluetooth/hci.h | 18 ++++
net/bluetooth/hci_sync.c | 41 +++++++-
net/bluetooth/mgmt.c | 6 +-
net/core/of_net.c | 52 +++++++---
17 files changed, 342 insertions(+), 18 deletions(-)
---
base-commit: dffcfe75c722be66aa2669fb335528edb0590671
change-id: 20260428-block-as-nvmem-4b308e8bda9a
Best regards,
--
Loic Poulain <loic.poulain@oss.qualcomm.com>
^ permalink raw reply
* [PATCH v8 1/9] dt-bindings: mmc: Document fixed-layout NVMEM provider support
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
Allow an eMMC hardware partition node to describe an NVMEM layout so the
partition can be exposed as an NVMEM provider. This lets a partition
(e.g. an eMMC boot partition) store device-specific information such as a
WiFi MAC address or a Bluetooth BD address and reference it through NVMEM
cells.
Accept "fixed-layout" as the partition node compatible, in addition to
"fixed-partitions", so the layout can be described directly on the
partition node.
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
.../devicetree/bindings/mmc/mmc-card.yaml | 23 +++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/Documentation/devicetree/bindings/mmc/mmc-card.yaml b/Documentation/devicetree/bindings/mmc/mmc-card.yaml
index a61d6c96df759102f9c1fbfd548b026a77921cae..0422894508478c8d0ca68292b58a5fdbee218358 100644
--- a/Documentation/devicetree/bindings/mmc/mmc-card.yaml
+++ b/Documentation/devicetree/bindings/mmc/mmc-card.yaml
@@ -38,7 +38,9 @@ patternProperties:
properties:
compatible:
contains:
- const: fixed-partitions
+ enum:
+ - fixed-partitions
+ - fixed-layout
required:
- compatible
@@ -86,6 +88,25 @@ examples:
read-only;
};
};
+
+ partitions-boot2 {
+ compatible = "fixed-layout";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ mac-addr@4400 {
+ compatible = "mac-base";
+ reg = <0x4400 0x6>;
+ #nvmem-cell-cells = <1>;
+ };
+
+ bd-addr@5400 {
+ compatible = "mac-base";
+ reg = <0x5400 0x6>;
+ #nvmem-cell-cells = <1>;
+ };
+ };
};
};
--
2.34.1
^ permalink raw reply related
* [PATCH v8 2/9] dt-bindings: net: wireless: qcom,ath10k: Document NVMEM cells
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski, Krzysztof Kozlowski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
Document the NVMEM cells supported by the ath10k driver, the
mac-address, pre-calibration data, and calibration data.
Since such data may also originate from chipset OTP or be supplied
via other device tree structures, all of these cells are optional
and can be provided independently, in any combination.
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
.../devicetree/bindings/net/wireless/qcom,ath10k.yaml | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
index c21d66c7cd558ab792524be9afec8b79272d1c87..878c5d833a9cb073520c256c1b72d0f1489e7f4a 100644
--- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
@@ -92,6 +92,22 @@ properties:
ieee80211-freq-limit: true
+ nvmem-cells:
+ minItems: 1
+ maxItems: 3
+ description:
+ References to nvmem cells for MAC address and/or calibration data.
+ Supported cell names are mac-address, calibration, and pre-calibration.
+
+ nvmem-cell-names:
+ minItems: 1
+ maxItems: 3
+ items:
+ enum:
+ - mac-address
+ - calibration
+ - pre-calibration
+
qcom,calibration-data:
$ref: /schemas/types.yaml#/definitions/uint8-array
description:
--
2.34.1
^ permalink raw reply related
* [PATCH v8 4/9] nvmem: layouts: Support fixed-layout as the nvmem device node itself
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
of_nvmem_layout_get_container() only looks for a child node named
"nvmem-layout" to locate the cell definitions. This does not cover
providers whose device tree node is itself the fixed-layout container,
such as an eMMC boot partition block device whose fwnode points directly
at a "fixed-layout" compatible partitions node.
When no "nvmem-layout" child is present, fall back to returning the nvmem
device node itself if it is compatible with "fixed-layout", so that its
cells are parsed by nvmem_add_cells_from_fixed_layout().
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
drivers/nvmem/layouts.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c
index b90584e1b99eab4217cbe7ec48373e18a7caf0dc..efa631ce7283bdd6c8ecda75915911b5e3a33c99 100644
--- a/drivers/nvmem/layouts.c
+++ b/drivers/nvmem/layouts.c
@@ -167,7 +167,18 @@ static int nvmem_layout_bus_populate(struct nvmem_device *nvmem,
struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
{
- return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
+ struct device_node *np;
+
+ /* Search for nvmem-layout child */
+ np = of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
+ if (np)
+ return np;
+
+ /* The nvmem of_node is itself a fixed-layout node */
+ if (of_device_is_compatible(nvmem->dev.of_node, "fixed-layout"))
+ return of_node_get(nvmem->dev.of_node);
+
+ return NULL;
}
EXPORT_SYMBOL_GPL(of_nvmem_layout_get_container);
--
2.34.1
^ permalink raw reply related
* [PATCH v8 5/9] block: implement NVMEM provider
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
From: Daniel Golle <daniel@makrotopia.org>
On embedded devices using an eMMC it is common that one or more partitions
on the eMMC are used to store MAC addresses and Wi-Fi calibration EEPROM
data. Allow referencing the partition in device tree for the kernel and
Wi-Fi drivers accessing it via the NVMEM layer.
NVMEM is registered for a block device whose OF node describes an NVMEM
layout, either via an "nvmem-layout" child or by being a "fixed-layout"
node itself (e.g. an eMMC boot partition associated through its mmc-card
node).
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Co-developed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
block/Kconfig | 11 +++++
block/Makefile | 1 +
block/blk-nvmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++
block/blk.h | 8 ++++
block/genhd.c | 4 ++
include/linux/blk_types.h | 4 ++
6 files changed, 138 insertions(+)
diff --git a/block/Kconfig b/block/Kconfig
index 15027963472d7b40e27b9097a5993c457b5b3054..69370ea2e268b9e5320477c747db4aab15abe3ff 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -209,6 +209,17 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.
+config BLK_NVMEM
+ bool "Block device NVMEM provider"
+ depends on OF
+ depends on NVMEM
+ help
+ Allow block devices (or partitions) to act as NVMEM providers,
+ exposing factory-provisioned data such as MAC addresses or Wi-Fi
+ calibration blobs to the drivers that consume them. This is
+ typically used on embedded devices where such data is stored in a
+ dedicated area of an eMMC, instead of a separate EEPROM or OTP.
+
source "block/partitions/Kconfig"
config BLK_PM
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c4274c11a0a61121c83d9c43d6e0c..d7ac389e71902bc091a8800ea266190a43b3e63d 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
+obj-$(CONFIG_BLK_NVMEM) += blk-nvmem.o
diff --git a/block/blk-nvmem.c b/block/blk-nvmem.c
new file mode 100644
index 0000000000000000000000000000000000000000..642a12f8a8b1c88e25baa1b684adef48c8c91c84
--- /dev/null
+++ b/block/blk-nvmem.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * block device NVMEM provider
+ *
+ * Copyright (c) 2024 Daniel Golle <daniel@makrotopia.org>
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ *
+ * Useful on devices using a partition on an eMMC for MAC addresses or
+ * Wi-Fi calibration EEPROM data.
+ */
+
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/nvmem-provider.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/of.h>
+#include <linux/pagemap.h>
+#include <linux/property.h>
+
+#include "blk.h"
+
+static int blk_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes)
+{
+ dev_t devt = (dev_t)(uintptr_t)priv;
+ size_t bytes_left = bytes;
+ loff_t pos = from;
+ int ret = 0;
+
+ struct file *bdev_file __free(fput) =
+ bdev_file_open_by_dev(devt, BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
+
+ while (bytes_left) {
+ pgoff_t f_index = pos >> PAGE_SHIFT;
+ struct folio *folio;
+ size_t folio_off;
+ size_t to_read;
+
+ folio = read_mapping_folio(bdev_file->f_mapping, f_index, NULL);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ folio_off = offset_in_folio(folio, pos);
+ to_read = min(bytes_left, folio_size(folio) - folio_off);
+ memcpy_from_folio(val, folio, folio_off, to_read);
+ pos += to_read;
+ bytes_left -= to_read;
+ val += to_read;
+ folio_put(folio);
+ }
+
+ return ret;
+}
+
+int blk_nvmem_add(struct block_device *bdev)
+{
+ struct device *dev = &bdev->bd_device;
+ struct nvmem_config config = {};
+ struct nvmem_device *nvmem;
+
+ /* skip devices which do not have a device tree node */
+ if (!dev_of_node(dev))
+ return 0;
+
+ /* skip devices without an nvmem layout defined */
+ struct device_node *child __free(device_node) =
+ of_get_child_by_name(dev_of_node(dev), "nvmem-layout");
+ if (!child && !of_device_is_compatible(dev_of_node(dev), "fixed-layout"))
+ return 0;
+
+ /*
+ * skip block device too large to be represented as NVMEM devices,
+ * nvmem_config.size is a signed int
+ */
+ if (bdev_nr_bytes(bdev) > INT_MAX) {
+ dev_warn(dev, "block device too large to be an NVMEM provider\n");
+ return 0;
+ }
+
+ config.id = NVMEM_DEVID_NONE;
+ config.dev = dev;
+ config.name = dev_name(dev);
+ config.owner = THIS_MODULE;
+ config.priv = (void *)(uintptr_t)dev->devt;
+ config.reg_read = blk_nvmem_reg_read;
+ config.size = bdev_nr_bytes(bdev);
+ config.word_size = 1;
+ config.stride = 1;
+ config.read_only = true;
+ config.root_only = true;
+ config.ignore_wp = true;
+ config.of_node = to_of_node(dev->fwnode);
+
+ nvmem = nvmem_register(&config);
+ if (IS_ERR(nvmem))
+ return dev_err_probe(dev, PTR_ERR(nvmem), "Failed to register NVMEM device\n");
+
+ bdev->bd_nvmem = nvmem;
+ return 0;
+}
+
+void blk_nvmem_del(struct block_device *bdev)
+{
+ nvmem_unregister(bdev->bd_nvmem);
+ bdev->bd_nvmem = NULL;
+}
diff --git a/block/blk.h b/block/blk.h
index ec4674cdf2ead4fd259ff5fc42401f591e684ee9..ed0c10168ba7be10855509637f824a9cea2b9ccb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -757,4 +757,12 @@ static inline void blk_debugfs_unlock(struct request_queue *q,
memalloc_noio_restore(memflags);
}
+#ifdef CONFIG_BLK_NVMEM
+int blk_nvmem_add(struct block_device *bdev);
+void blk_nvmem_del(struct block_device *bdev);
+#else
+static inline int blk_nvmem_add(struct block_device *bdev) { return 0; }
+static inline void blk_nvmem_del(struct block_device *bdev) {}
+#endif
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e95ae9134309679a7c6a937f5b7db8..1b2382de6fb30c1e5f60f45c04dc03ed3bf5d5f2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -421,6 +421,8 @@ static void add_disk_final(struct gendisk *disk)
*/
dev_set_uevent_suppress(ddev, 0);
disk_uevent(disk, KOBJ_ADD);
+
+ blk_nvmem_add(disk->part0);
}
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
@@ -704,6 +706,8 @@ static void __del_gendisk(struct gendisk *disk)
disk_del_events(disk);
+ blk_nvmem_del(disk->part0);
+
/*
* Prevent new openers by unlinked the bdev inode.
*/
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c09e0ceaac41ba59e86fb0c4efc64..6ed173c649025b95cce9253b27f68f2c7dbab8eb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -15,6 +15,7 @@
struct bio_set;
struct bio;
struct bio_integrity_payload;
+struct nvmem_device;
struct page;
struct io_context;
struct cgroup_subsys_state;
@@ -73,6 +74,9 @@ struct block_device {
int bd_writers;
#ifdef CONFIG_SECURITY
void *bd_security;
+#endif
+#ifdef CONFIG_BLK_NVMEM
+ struct nvmem_device *bd_nvmem;
#endif
/*
* keep this out-of-line as it's both big and not needed in the fast
--
2.34.1
^ permalink raw reply related
* [PATCH v8 6/9] net: of_net: Add of_get_nvmem_eui48() helper for EUI-48 lookup
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
Factor out the common NVMEM EUI-48 retrieval logic from
of_get_mac_address_nvmem() into a new of_get_nvmem_eui48() helper that
accepts the NVMEM cell name as a parameter. This allows other subsystems
(e.g. Bluetooth) to reuse the same lookup-validate-copy pattern with a
different cell name, without duplicating code.
of_get_mac_address_nvmem() is updated to call of_get_nvmem_eui48() with
"mac-address", preserving its existing behavior.
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
include/linux/of_net.h | 7 +++++++
net/core/of_net.c | 52 ++++++++++++++++++++++++++++++++++++++------------
2 files changed, 47 insertions(+), 12 deletions(-)
diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index d88715a0b3a52f87af23d47791bea3baf5be5200..7854ba555d9a55f3d020a37fe00a27ae52e0e5dc 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -15,6 +15,7 @@ struct net_device;
extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface);
extern int of_get_mac_address(struct device_node *np, u8 *mac);
extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac);
+int of_get_nvmem_eui48(struct device_node *np, const char *cell_name, u8 *addr);
int of_get_ethdev_address(struct device_node *np, struct net_device *dev);
extern struct net_device *of_find_net_device_by_node(struct device_node *np);
#else
@@ -34,6 +35,12 @@ static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac)
return -ENODEV;
}
+static inline int of_get_nvmem_eui48(struct device_node *np,
+ const char *cell_name, u8 *addr)
+{
+ return -ENODEV;
+}
+
static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev)
{
return -ENODEV;
diff --git a/net/core/of_net.c b/net/core/of_net.c
index 93ea425b9248a23f4f95a336e9cdbf0053248e32..999ca0e9258c41580823b80e0ef6db26617bc968 100644
--- a/net/core/of_net.c
+++ b/net/core/of_net.c
@@ -61,9 +61,7 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
int of_get_mac_address_nvmem(struct device_node *np, u8 *addr)
{
struct platform_device *pdev = of_find_device_by_node(np);
- struct nvmem_cell *cell;
- const void *mac;
- size_t len;
+ u8 mac[ETH_ALEN] __aligned(sizeof(u16));
int ret;
/* Try lookup by device first, there might be a nvmem_cell_lookup
@@ -75,27 +73,57 @@ int of_get_mac_address_nvmem(struct device_node *np, u8 *addr)
return ret;
}
- cell = of_nvmem_cell_get(np, "mac-address");
+ ret = of_get_nvmem_eui48(np, "mac-address", mac);
+ if (ret)
+ return ret;
+
+ if (!is_valid_ether_addr(mac))
+ return -EINVAL;
+
+ memcpy(addr, mac, ETH_ALEN);
+ return 0;
+}
+EXPORT_SYMBOL(of_get_mac_address_nvmem);
+
+/**
+ * of_get_nvmem_eui48 - Read a 6-byte EUI-48 address from a named NVMEM cell.
+ * @np: Device node to look up the NVMEM cell from.
+ * @cell_name: Name of the NVMEM cell (e.g. "mac-address", "local-bd-address").
+ * @addr: Output buffer for the 6-byte address.
+ *
+ * Reads the named NVMEM cell and validates that it contains a 6-byte address
+ * that is not all-zeroes or all-ones. Returns 0 on success, negative errno.
+ */
+int of_get_nvmem_eui48(struct device_node *np, const char *cell_name, u8 *addr)
+{
+ struct nvmem_cell *cell;
+ const void *eui48;
+ size_t len;
+
+ cell = of_nvmem_cell_get(np, cell_name);
if (IS_ERR(cell))
return PTR_ERR(cell);
- mac = nvmem_cell_read(cell, &len);
+ eui48 = nvmem_cell_read(cell, &len);
nvmem_cell_put(cell);
- if (IS_ERR(mac))
- return PTR_ERR(mac);
+ if (IS_ERR(eui48))
+ return PTR_ERR(eui48);
- if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
- kfree(mac);
+ /* Reject unprogrammed cells that read as all-zeroes or all-ones */
+ if (len != ETH_ALEN ||
+ !memchr_inv(eui48, 0x00, ETH_ALEN) ||
+ !memchr_inv(eui48, 0xff, ETH_ALEN)) {
+ kfree(eui48);
return -EINVAL;
}
- memcpy(addr, mac, ETH_ALEN);
- kfree(mac);
+ memcpy(addr, eui48, ETH_ALEN);
+ kfree(eui48);
return 0;
}
-EXPORT_SYMBOL(of_get_mac_address_nvmem);
+EXPORT_SYMBOL_GPL(of_get_nvmem_eui48);
/**
* of_get_mac_address()
--
2.34.1
^ permalink raw reply related
* [PATCH v8 3/9] dt-bindings: bluetooth: qcom: Add NVMEM BD address cell
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
Add support for an NVMEM cell provider for "local-bd-address",
allowing the Bluetooth stack to retrieve controller's BD address
from non-volatile storage such as an EEPROM or an eMMC partition.
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
.../devicetree/bindings/net/bluetooth/qcom,bluetooth-common.yaml | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/Documentation/devicetree/bindings/net/bluetooth/qcom,bluetooth-common.yaml b/Documentation/devicetree/bindings/net/bluetooth/qcom,bluetooth-common.yaml
index c8e9c55c1afb4c8e05ba2dae41ce2db4194b4a0f..7cb28f30c9af032082f23311f2fc89a32f266f17 100644
--- a/Documentation/devicetree/bindings/net/bluetooth/qcom,bluetooth-common.yaml
+++ b/Documentation/devicetree/bindings/net/bluetooth/qcom,bluetooth-common.yaml
@@ -22,4 +22,13 @@ properties:
description:
boot firmware is incorrectly passing the address in big-endian order
+ nvmem-cells:
+ maxItems: 1
+ description:
+ Nvmem data cell that contains a 6 byte BD address with the most
+ significant byte first (big-endian).
+
+ nvmem-cell-names:
+ const: local-bd-address
+
additionalProperties: true
--
2.34.1
^ permalink raw reply related
* [PATCH v8 7/9] Bluetooth: hci_sync: Add NVMEM-backed BD address retrieval
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski, Piotr Kwapulinski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
Some devices store the Bluetooth BD address in non-volatile
memory, which can be accessed through the NVMEM framework.
Similar to Ethernet or WiFi MAC addresses, add support for
reading the BD address from a 'local-bd-address' NVMEM cell.
As with the device-tree provided BD address, add a quirk to
indicate whether a device or platform should attempt to read
the address from NVMEM when no valid in-chip address is present.
Also add a quirk to indicate if the address is stored in
big-endian byte order.
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Reviewed-by: Piotr Kwapulinski <piotr.kwapulinski@intel.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
include/net/bluetooth/hci.h | 18 ++++++++++++++++++
net/bluetooth/hci_sync.c | 41 ++++++++++++++++++++++++++++++++++++++++-
net/bluetooth/mgmt.c | 6 ++++--
3 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 572b1c620c5d653a1fe10b26c1b0ba33e8f4968f..7686466d1109253b0d75edeb5f6a99fb98ce4cc6 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -164,6 +164,24 @@ enum {
*/
HCI_QUIRK_BDADDR_PROPERTY_BROKEN,
+ /* When this quirk is set, the public Bluetooth address
+ * initially reported by HCI Read BD Address command
+ * is considered invalid. The public BD Address can be
+ * retrieved via a 'local-bd-address' NVMEM cell.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_USE_BDADDR_NVMEM,
+
+ /* When this quirk is set, the Bluetooth Device Address provided by
+ * the 'local-bd-address' NVMEM is stored in big-endian order.
+ *
+ * This quirk can be set before hci_register_dev is called or
+ * during the hdev->setup vendor callback.
+ */
+ HCI_QUIRK_BDADDR_NVMEM_BE,
+
/* When this quirk is set, the duplicate filtering during
* scanning is based on Bluetooth devices addresses. To allow
* RSSI based updates, restart scanning if needed.
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index fd3aacdea512a37c22b9a2be90c89ddca4b4d99f..56248d4abcb5b1d9993962a9f6bf60bf865b8d7b 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -7,6 +7,7 @@
*/
#include <linux/property.h>
+#include <linux/of_net.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -3588,6 +3589,39 @@ int hci_powered_update_sync(struct hci_dev *hdev)
return 0;
}
+/**
+ * hci_dev_get_bd_addr_from_nvmem - Get the Bluetooth Device Address
+ * (BD_ADDR) for a HCI device from
+ * an NVMEM cell.
+ * @hdev: The HCI device
+ *
+ * Search for 'local-bd-address' NVMEM cell in the device firmware node.
+ *
+ * All-zero and all-ones BD addresses are rejected (unprovisioned).
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+static int hci_dev_get_bd_addr_from_nvmem(struct hci_dev *hdev)
+{
+ struct device_node *np = dev_of_node(hdev->dev.parent);
+ u8 ba[sizeof(bdaddr_t)];
+ int err;
+
+ if (!np)
+ return -ENODEV;
+
+ err = of_get_nvmem_eui48(np, "local-bd-address", ba);
+ if (err)
+ return err;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_NVMEM_BE))
+ baswap(&hdev->public_addr, (bdaddr_t *)ba);
+ else
+ bacpy(&hdev->public_addr, (bdaddr_t *)ba);
+
+ return 0;
+}
+
/**
* hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address
* (BD_ADDR) for a HCI device from
@@ -5042,12 +5076,17 @@ static int hci_dev_setup_sync(struct hci_dev *hdev)
* its setup callback.
*/
invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
- hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_NVMEM);
if (!ret) {
if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) &&
!bacmp(&hdev->public_addr, BDADDR_ANY))
hci_dev_get_bd_addr_from_property(hdev);
+ if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_NVMEM) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ hci_dev_get_bd_addr_from_nvmem(hdev);
+
if (invalid_bdaddr && bacmp(&hdev->public_addr, BDADDR_ANY) &&
hdev->set_bdaddr) {
ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index b05bb380e5f87715e2a4ac38aa7e7eb77c48549f..8f2a2d0f76adc3dd9cbe235d5ad0ed5e1667fcb6 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -617,7 +617,8 @@ static bool is_configured(struct hci_dev *hdev)
return false;
if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
- hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_NVMEM)) &&
!bacmp(&hdev->public_addr, BDADDR_ANY))
return false;
@@ -633,7 +634,8 @@ static __le32 get_missing_options(struct hci_dev *hdev)
options |= MGMT_OPTION_EXTERNAL_CONFIG;
if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
- hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_NVMEM)) &&
!bacmp(&hdev->public_addr, BDADDR_ANY))
options |= MGMT_OPTION_PUBLIC_ADDRESS;
--
2.34.1
^ permalink raw reply related
* [PATCH v8 8/9] Bluetooth: qca: Set NVMEM BD address quirks when address is invalid
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Bartosz Golaszewski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
When the controller BD address is invalid (zero or default),
set the NVMEM quirks to allow retrieving the address from a
'local-bd-address' NVMEM cell. The BD address is often stored
alongside the WiFi MAC address in big-endian format, so also
set the big-endian quirk.
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
drivers/bluetooth/btqca.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c
index dda76365726f0bfe0e80e05fe04859fa4f0592e1..df33eacfd29fa680f393f90215150743e6001d5b 100644
--- a/drivers/bluetooth/btqca.c
+++ b/drivers/bluetooth/btqca.c
@@ -721,8 +721,11 @@ static int qca_check_bdaddr(struct hci_dev *hdev, const struct qca_fw_config *co
}
bda = (struct hci_rp_read_bd_addr *)skb->data;
- if (!bacmp(&bda->bdaddr, &config->bdaddr))
+ if (!bacmp(&bda->bdaddr, &config->bdaddr)) {
hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
+ hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_NVMEM);
+ hci_set_quirk(hdev, HCI_QUIRK_BDADDR_NVMEM_BE);
+ }
kfree_skb(skb);
--
2.34.1
^ permalink raw reply related
* [PATCH v8 9/9] arm64: dts: qcom: arduino-imola: Describe NVMEM layout for WiFi/BT addresses
From: Loic Poulain @ 2026-07-03 13:45 UTC (permalink / raw)
To: Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
Russell King, Saravana Kannan, Christian Marangi
Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
Loic Poulain, Konrad Dybcio, Bartosz Golaszewski
In-Reply-To: <20260703-block-as-nvmem-v8-0-98ae32bfc49a@oss.qualcomm.com>
On Arduino Uno-Q, the eMMC boot1 partition is factory provisioned
with device-specific information such as the WiFi MAC address
and the Bluetooth BD address. This partition can serve as an
alternative to additional non-volatile memory, such as a
dedicated EEPROM.
The eMMC boot partitions are typically good candidates, as they
are relatively small, read-only by default (and can be enforced
as hardware read-only), and are not affected by board reflashing
procedures, which generally target the eMMC user or GP partitions.
Describe the corresponding nvmem-layout for the WiFi and Bluetooth
addresses, and point the WiFi and Bluetooth nodes to the appropriate
NVMEM cells to retrieve them.
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
---
arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts | 32 ++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts b/arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts
index bf088fa9807f040f0c8f405f9111b01790b09377..38839b8a361e76f6c1989924b16095b9d8815f66 100644
--- a/arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts
+++ b/arch/arm64/boot/dts/qcom/qrb2210-arduino-imola.dts
@@ -409,7 +409,33 @@ &sdhc_1 {
no-sdio;
no-sd;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
status = "okay";
+
+ card@0 {
+ compatible = "mmc-card";
+ reg = <0>;
+
+ partitions-boot1 {
+ compatible = "fixed-layout";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ wifi_mac_addr: mac-addr@4400 {
+ compatible = "mac-base";
+ reg = <0x4400 0x6>;
+ #nvmem-cell-cells = <1>;
+ };
+
+ bd_addr: bd-addr@5400 {
+ compatible = "mac-base";
+ reg = <0x5400 0x6>;
+ #nvmem-cell-cells = <1>;
+ };
+ };
+ };
};
&spi5 {
@@ -512,6 +538,9 @@ bluetooth {
vddch0-supply = <&pm4125_l22>;
enable-gpios = <&tlmm 87 GPIO_ACTIVE_HIGH>;
max-speed = <3000000>;
+
+ nvmem-cells = <&bd_addr 0>;
+ nvmem-cell-names = "local-bd-address";
};
};
@@ -557,6 +586,9 @@ &wifi {
qcom,ath10k-calibration-variant = "ArduinoImola";
firmware-name = "qcm2290";
+ nvmem-cells = <&wifi_mac_addr 0>;
+ nvmem-cell-names = "mac-address";
+
status = "okay";
};
--
2.34.1
^ permalink raw reply related
* [PATCH] block: fix BLKSECDISCARD zero-length range causing page cache invalidation
From: Zizhi Wo @ 2026-07-04 7:39 UTC (permalink / raw)
To: axboe, linux-block; +Cc: linux-kernel, yangerkun, chengzhihao1, wozizhi
From: Zizhi Wo <wozizhi@huawei.com>
Commit 697ba0b6ec4a ("block: fix integer overflow in BLKSECDISCARD") fixed
the start+len overflow via check_add_overflow() but did not handle the
start=0, len=0 case. There, start + len = 0, so end = 0 passes all checks,
and truncate_bdev_range()->truncate_inode_pages_range() is then called with
lend=UINT64_MAX, which is the "truncate to the end of file" sentinel, so
the entire page cache is invalidated.
Fix this by replacing the validation with blk_validate_byte_range(), which
already rejects a zero-length range and is what BLKDISCARD uses. This also
switches the alignment check from a hardcoded 512 to
bdev_logical_block_size().
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
block/ioctl.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/block/ioctl.c b/block/ioctl.c
index 3d4ea1537457..3b7d33a737e8 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -176,8 +176,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
void __user *argp)
{
- uint64_t start, len, end;
- uint64_t range[2];
+ uint64_t range[2], start, len;
int err;
if (!(mode & BLK_OPEN_WRITE))
@@ -189,15 +188,13 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
start = range[0];
len = range[1];
- if ((start & 511) || (len & 511))
- return -EINVAL;
- if (check_add_overflow(start, len, &end) ||
- end > bdev_nr_bytes(bdev))
- return -EINVAL;
+ err = blk_validate_byte_range(bdev, start, len);
+ if (err)
+ return err;
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
- err = truncate_bdev_range(bdev, mode, start, end - 1);
+ err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
--
2.52.0
^ permalink raw reply related
* [PATCH] null_blk: initialize lock mutex before registering configfs subsystem
From: Zizhi Wo @ 2026-07-04 9:23 UTC (permalink / raw)
To: axboe, dlemoal, nilay, linux-block, kch, johannes.thumshirn,
kbusch
Cc: linux-kernel, yangerkun, chengzhihao1, wozizhi
From: Zizhi Wo <wozizhi@huawei.com>
In null_init(), mutex_init(&lock) currently happens after
configfs_register_subsystem(), which exposes the nullb subsystem to
userspace. A racing mkdir() into /sys/kernel/config/nullb/ can reach
null_find_dev_by_name() -> mutex_lock(&lock) before the mutex is
initialized, triggering the following warning:
[ 123.137788] DEBUG_LOCKS_WARN_ON(lock->magic != lock)
[ 123.137796] WARNING: kernel/locking/mutex.c:159 at mutex_lock+0x171/0x1c0, CPU#13: mkdir/1301
[ 123.140090] Modules linked in: null_blk(+) nft_fib_inet nft_fib_ipv4
......
[ 123.154926] Call Trace:
[ 123.155172] <TASK>
[ 123.155419] ? __pfx_mutex_lock+0x10/0x10
[ 123.156181] ? __pfx__raw_spin_lock+0x10/0x10
[ 123.156571] nullb_group_make_group+0x20/0x100 [null_blk]
[ 123.157011] configfs_mkdir+0x47b/0xc70
[ 123.157337] ? __pfx_configfs_mkdir+0x10/0x10
[ 123.157719] ? may_create_dentry+0x242/0x2e0
[ 123.158061] vfs_mkdir+0x2a9/0x6c0
[ 123.158352] filename_mkdirat+0x3dc/0x500
[ 123.158710] ? __pfx_filename_mkdirat+0x10/0x10
[ 123.159070] ? strncpy_from_user+0x3a/0x1d0
[ 123.159413] __x64_sys_mkdir+0x6b/0x90
[ 123.159760] do_syscall_64+0xea/0x600
Move mutex_init(&lock) before configfs_register_subsystem().
Fixes: 49c3b9266a71 ("block: null_blk: Improve device creation with configfs")
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
drivers/block/null_blk/main.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index f8c0fd57e041..98616d290c5c 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -2161,13 +2161,12 @@ static int __init null_init(void)
config_group_init(&nullb_subsys.su_group);
mutex_init(&nullb_subsys.su_mutex);
+ mutex_init(&lock);
ret = configfs_register_subsystem(&nullb_subsys);
if (ret)
return ret;
- mutex_init(&lock);
-
null_major = register_blkdev(0, "nullb");
if (null_major < 0) {
ret = null_major;
--
2.52.0
^ permalink raw reply related
* Re: [PATCH] null_blk: initialize lock mutex before registering configfs subsystem
From: Bart Van Assche @ 2026-07-04 13:25 UTC (permalink / raw)
To: Zizhi Wo, axboe, dlemoal, nilay, linux-block, kch,
johannes.thumshirn, kbusch
Cc: linux-kernel, yangerkun, chengzhihao1, wozizhi
In-Reply-To: <20260704092323.748772-1-wozizhi@huaweicloud.com>
> diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
> index f8c0fd57e041..98616d290c5c 100644
> --- a/drivers/block/null_blk/main.c
> +++ b/drivers/block/null_blk/main.c
> @@ -2161,13 +2161,12 @@ static int __init null_init(void)
>
> config_group_init(&nullb_subsys.su_group);
> mutex_init(&nullb_subsys.su_mutex);
> + mutex_init(&lock);
>
> ret = configfs_register_subsystem(&nullb_subsys);
> if (ret)
> return ret;
>
> - mutex_init(&lock);
> -
> null_major = register_blkdev(0, "nullb");
> if (null_major < 0) {
> ret = null_major;
Why an explicit mutex_init() call instead of changing "static struct
mutex lock" into "static DEFINE_MUTEX(lock)"? Additionally, shouldn't
the "lock" name be made more descriptive since it has file scope?
Thanks,
Bart.
^ permalink raw reply
* [PATCH v5] badblocks: fix infinite loop due to incorrect rounding and overflow
From: Ramesh Adhikari @ 2026-07-04 17:13 UTC (permalink / raw)
To: axboe, gregkh; +Cc: linux-block, lkp, Ramesh Adhikari
In-Reply-To: <20260427151048.756072-1-adhikari.resume@gmail.com>
The roundup() and rounddown() macros return the rounded value but
do not modify their input in place. In _badblocks_set(),
_badblocks_clear(), and badblocks_check(), the return values were
being discarded, so s and target/next remained unrounded. Sectors
were then calculated from these unrounded values, which could make
sectors way too large (or zero), causing infinite loops in the
re_insert/re_clear/re_check loops.
This was confirmed with local syzkaller fuzzing against the nvdimm
ioctl path (ND_IOCTL_CLEAR_ERROR -> nvdimm_clear_badblocks_region()
-> badblocks_clear()), which reliably produces RCU stalls with the
looping task caught mid-loop:
rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
rcu: rcu_preempt kthread starved for 21001 jiffies! g40229 f0x0 RCU_GP_WAIT_FQS(5)
...
Call Trace:
badblocks_clear+0x259/0xb10
nvdimm_clear_badblocks_region+0x165/0x1e0 [libnvdimm]
device_for_each_child+0x11e/0x1a0
nd_ioctl+0x1413/0x1750 [libnvdimm]
__x64_sys_ioctl+0x18e/0x210
do_syscall_64+0x102/0x5a0
Fix this by properly capturing the return values of round_up()/
round_down() (the power-of-two variants, since 1 << bb->shift is
always a power of two -- roundup()/rounddown() use a division/
modulo internally, which is undesirable here). Also add overflow
checks (s > ULLONG_MAX - sectors) before the s + sectors addition
in all three functions, and handle the case where sectors becomes
zero after rounding.
The overflow check is done unconditionally, before the bb->shift
rounding block, rather than inside it. bb->shift == 0 is not just
an initial state -- __badblocks_init() sets it to 0 by default, and
drivers/md/md.c explicitly sets rdev->badblocks.shift back to 0 in
several paths -- so s + sectors needs the same overflow guard
whether or not rounding happens.
Signed-off-by: Ramesh Adhikari <adhikari.resume@gmail.com>
---
v1-v3 chased individual len==0 symptoms in _badblocks_clear()/
_badblocks_check() one call site at a time. Jens pointed out that
approach wasn't finding the actual bug, just papering over spots as
they were noticed. v4 was a full rewrite around the real root cause
(roundup()/rounddown() discarding their return values), covering all
three functions with proper overflow handling.
Changes in v5:
- Switch from roundup()/rounddown() (which use division/modulo on
sector_t, a u64) to round_up()/round_down() (bitmask-based, since
1 << bb->shift is always a power of two). Fixes the v4 build
failure kernel test robot reported on 32-bit (undefined reference
to __aeabi_uldivmod on ARM, __umoddi3 on i386).
- Move the s > ULLONG_MAX - sectors overflow check so it runs
unconditionally in all three functions, instead of only inside
the `if (bb->shift)` block. bb->shift == 0 is a real, common
state (default init value, and explicitly set by drivers/md/md.c
in several paths), so the overflow guard needs to apply there
too, not just when rounding is active.
- Build-tested locally (x86_64) and confirmed no residual div/mod
symbols in the object file.
Link to v4: https://lore.kernel.org/r/20260427151048.756072-1-adhikari.resume@gmail.com
block/badblocks.c | 40 ++++++++++++++++++++++++++++++++--------
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/block/badblocks.c b/block/badblocks.c
index ece64e76fe8..e8484912532 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -853,15 +853,21 @@ static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,
/* Invalid sectors number */
return false;
+ if (s > ULLONG_MAX - sectors)
+ return false;
+
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- rounddown(s, 1 << bb->shift);
- roundup(next, 1 << bb->shift);
+ s = round_down(s, 1 << bb->shift);
+ next = round_up(next, 1 << bb->shift);
sectors = next - s;
}
+ if (sectors == 0)
+ return false;
+
write_seqlock_irqsave(&bb->lock, flags);
bad.ack = acknowledged;
@@ -1061,6 +1067,9 @@ static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
/* Invalid sectors number */
return false;
+ if (s > ULLONG_MAX - sectors)
+ return false;
+
if (bb->shift) {
sector_t target;
@@ -1071,11 +1080,17 @@ static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
* isn't than to think a block is not bad when it is.
*/
target = s + sectors;
- roundup(s, 1 << bb->shift);
- rounddown(target, 1 << bb->shift);
- sectors = target - s;
+ s = round_up(s, 1 << bb->shift);
+ target = round_down(target, 1 << bb->shift);
+ if (target < s)
+ sectors = 0;
+ else
+ sectors = target - s;
}
+ if (sectors == 0)
+ return false;
+
write_seqlock_irq(&bb->lock);
bad.ack = true;
@@ -1303,13 +1318,22 @@ int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,
WARN_ON(bb->shift < 0 || sectors == 0);
+ if (s > ULLONG_MAX - sectors)
+ return -EINVAL;
+
if (bb->shift > 0) {
/* round the start down, and the end up */
sector_t target = s + sectors;
- rounddown(s, 1 << bb->shift);
- roundup(target, 1 << bb->shift);
- sectors = target - s;
+ s = round_down(s, 1 << bb->shift);
+ target = round_up(target, 1 << bb->shift);
+ if (target < s)
+ sectors = 0;
+ else
+ sectors = target - s;
+
+ if (sectors == 0)
+ return 0;
}
retry:
--
2.43.0
^ permalink raw reply related
* [RFC PATCH v1 00/17] blk-cgroup: protect blkgs with blkcg_mutex
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
From: Yu Kuai <yukuai@fygo.io>
This RFC moves queue-local blkg topology synchronization from
q->queue_lock to q->blkcg_mutex.
q->queue_lock is a hot block-layer spinlock used by request queue runtime
paths, and it is also used in irq-disabled or otherwise atomic contexts.
Using it to protect blkg topology makes blkg lookup, creation,
destruction, policy activation, and policy-state walks inherit those atomic
locking constraints. That forces awkward preallocation schemes such as
radix-tree preloading and prevents missing-blkg creation from sleeping,
even though blkg creation is a blkcg control-plane operation rather than a
queue dispatch fast-path operation.
q->blkcg_mutex is a better fit for blkg protection because it is already a
queue-local blkcg lock, it can serialize the full lookup/create/destroy and
policy activation path, and it allows allocation and parent lookup to run
from sleepable contexts. Moving blkg topology under q->blkcg_mutex also
separates blkcg topology from queue runtime locking, reducing queue_lock
scope and making the locking rules for blkcg policy users explicit.
bio_set_dev() and bio allocation with a bdev can associate a bio with the
destination queue's blkg. Once missing blkg creation is serialized by
q->blkcg_mutex, those helpers may sleep when they create a blkg. The first
part of the series therefore audits callers that can reach these helpers
from completion, spinlocked, irq-disabled, GFP_NOWAIT, or other
non-blocking paths, and either moves association to process context or uses
a nowait association path that avoids sleeping.
The preparatory patches cover NVMe multipath requeue, dm-thin and
dm-snapshot map paths, blk-throttle's private runtime lock, atomic bio
allocation helpers, bcache, dm-bufio, dm-pcache, DM NOWAIT clones/remaps,
and BFQ's locked cgroup update path. The final blkcg patches then move
blkg lookup/create/destroy, policy activation, and configuration
preparation to q->blkcg_mutex; remove radix-tree preloading; move blkg
allocation into blkg_create(); and share creation code between bio
association and config preparation.
This is RFC because the locking conversion changes a central blkcg lifetime
path and relies on all non-sleepable bio association users either being
converted or tolerating nowait association failure.
One intentional tradeoff is left in the nowait paths. They first try to
associate with an existing blkg. If a thread issues IO to a queue for the
first time from a GFP_NOWAIT or otherwise non-blocking path, the cgroup's
blkg for that queue may not exist yet. After blkg topology moves to
q->blkcg_mutex,
preemptible task-context callers try q->blkcg_mutex and attempt blkg
creation. Once allocation moves into blkg_create(), that opportunistic
nowait creation uses GFP_ATOMIC. If the caller is in atomic context,
q->blkcg_mutex is contended, or allocation fails, the nowait helper still
fails and the caller needs to retry from a blocking context, defer the
association, or fall back to an existing slow path.
Patch layout:
Patch 1: move NVMe multipath failover bio retargeting to requeue work so
bio_set_dev() runs from process context instead of completion context.
Patches 2-3: remove or avoid bio_set_dev() while dm-thin and dm-snapshot
locks are held, and restore blkcg association later where needed.
Patch 4: give blk-throttle its own runtime-state lock so blkcg topology
can be moved away from queue_lock.
Patches 5-7: add bio_alloc_atomic(), make bio association nowait-aware,
and make bio allocation with a bdev fail rather than sleep for
non-blocking callers.
Patches 8-12: convert bcache, dm-bufio, dm-pcache, block helper
allocations, and DM NOWAIT remaps/clones to the new nowait or deferred
association model.
Patch 13: avoid a sleeping blkg lookup from BFQ while bfqd->lock is held.
Patch 14: protect queue-local blkg lookup, creation, destruction, policy
activation, and policy state walks with q->blkcg_mutex. This also makes
preemptible nowait bio association try q->blkcg_mutex instead of failing
immediately after an RCU lookup miss.
Patch 15: remove radix-tree preloading after blkg creation no longer runs
under queue_lock.
Patch 16: allocate blkgs inside blkg_create() and use GFP_ATOMIC for the
nowait bio-association trylock creation path.
Patch 17: share blkg creation between bio association and config
preparation.
Yu Kuai (17):
nvme-multipath: retarget failedover bios from requeue work
dm thin: avoid bio_set_dev under pool lock
dm snapshot: avoid bio_set_dev in locked map paths
blk-throttle: protect throttle state with td lock
block: add bio_alloc_atomic() for atomic bio users
blk-cgroup: support non-blocking bio association
block: support non-blocking bio allocation with a bdev
bcache: avoid sleeping blkg association from locked paths
dm bufio: avoid blkg association from GFP_NOWAIT bio init
dm pcache: handle non-blocking bio clone init failure
block: avoid scheduling from non-blocking helper allocations
dm: avoid sleeping blkg association from NOWAIT remaps
bfq: avoid blkg lookup from locked cgroup update
blk-cgroup: protect blkgs with blkcg_mutex
blk-cgroup: remove blkg radix tree preloading
blk-cgroup: allocate blkgs in blkg_create
blk-cgroup: share blkg creation between lookup and config prep
block/bfq-cgroup.c | 26 +-
block/bio.c | 50 +++-
block/blk-cgroup.c | 397 ++++++++++++-----------------
block/blk-cgroup.h | 16 +-
block/blk-crypto-fallback.c | 2 +-
block/blk-iocost.c | 5 +-
block/blk-iolatency.c | 7 +-
block/blk-lib.c | 3 +-
block/blk-map.c | 7 +-
block/blk-throttle.c | 93 +++++--
drivers/md/bcache/journal.c | 9 +-
drivers/md/bcache/request.c | 4 +-
drivers/md/dm-bufio.c | 9 +-
drivers/md/dm-linear.c | 2 +-
drivers/md/dm-pcache/backing_dev.c | 10 +-
drivers/md/dm-snap.c | 29 ++-
drivers/md/dm-stripe.c | 6 +-
drivers/md/dm-switch.c | 2 +-
drivers/md/dm-thin.c | 3 -
drivers/md/dm-unstripe.c | 2 +-
drivers/md/dm.c | 28 +-
drivers/md/md.c | 2 +-
drivers/nvdimm/nd_virtio.c | 11 +-
drivers/nvme/host/multipath.c | 4 +-
fs/gfs2/lops.c | 3 +-
fs/ocfs2/cluster/heartbeat.c | 15 +-
include/linux/bio.h | 53 ++--
include/linux/device-mapper.h | 8 +
include/linux/writeback.h | 2 +-
mm/page_io.c | 2 +-
30 files changed, 467 insertions(+), 343 deletions(-)
base-commit: a1c8bdbbd72564cebb0d02948c1ed57b80b2e773
--
2.51.0
^ permalink raw reply
* [RFC PATCH v1 01/17] nvme-multipath: retarget failedover bios from requeue work
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
bio_set_dev() is about to become explicitly sleepable because it can
associate the bio with a blkg for the destination queue. NVMe failover
can run from request completion context, and nvme_failover_req() also holds
head->requeue_lock with interrupts disabled while it steals bios from the
failed request. Calling bio_set_dev() there is not safe once the helper is
allowed to sleep.
The requeue lock only protects head->requeue_list. Keep the list
manipulation under that lock, but defer retargeting to nvme_requeue_work(),
which already drains the list from process context before resubmitting each
bio. The bios remain private to the requeue list until the worker pops
them, so moving the device switch there preserves the existing retry flow
while avoiding a sleepable helper in completion context.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
drivers/nvme/host/multipath.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 9b9a657fa330..76baa180ae1c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -149,7 +149,6 @@ void nvme_failover_req(struct request *req)
struct nvme_ns *ns = req->q->queuedata;
u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
unsigned long flags;
- struct bio *bio;
nvme_mpath_clear_current_path(ns);
atomic_long_inc(&ns->failover);
@@ -165,8 +164,6 @@ void nvme_failover_req(struct request *req)
}
spin_lock_irqsave(&ns->head->requeue_lock, flags);
- for (bio = req->bio; bio; bio = bio->bi_next)
- bio_set_dev(bio, ns->head->disk->part0);
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
@@ -684,6 +681,7 @@ static void nvme_requeue_work(struct work_struct *work)
next = bio->bi_next;
bio->bi_next = NULL;
+ bio_set_dev(bio, head->disk->part0);
submit_bio_noacct(bio);
}
}
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 02/17] dm thin: avoid bio_set_dev under pool lock
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
bio_set_dev() is about to become explicitly sleepable because it can
associate the bio with a blkg for the destination queue. pool_map()
calls bio_set_dev() while holding pool->lock with interrupts disabled,
which would be invalid once bio_set_dev() may sleep.
The lock is not needed in this map path. The pool target is a singleton
mapping and pool_map() only reads pt->data_dev, which is a target-private
device reference acquired during construction and released during target
destruction. It does not inspect or modify pool state protected by
pool->lock.
Remove the lock so the remap stays in the normal sleepable DM map context
while the data device pointer remains stable for the table lifetime.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
drivers/md/dm-thin.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 59392de7a477..358ed77ffb2b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3438,14 +3438,11 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
static int pool_map(struct dm_target *ti, struct bio *bio)
{
struct pool_c *pt = ti->private;
- struct pool *pool = pt->pool;
/*
* As this is a singleton target, ti->begin is always zero.
*/
- spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
- spin_unlock_irq(&pool->lock);
return DM_MAPIO_REMAPPED;
}
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 03/17] dm snapshot: avoid bio_set_dev in locked map paths
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
bio_set_dev() is about to become explicitly sleepable. It currently
updates the bio's target device and then associates the bio with the
destination queue's blkcg state. After blkcg lookup/creation is moved
under the queue's blkcg_mutex, that association may take blkcg_mutex and
allocate a new blkg. Callers therefore must not invoke bio_set_dev() from
atomic or otherwise non-sleepable sections.
snapshot_map() has several remap decisions inside
dm_exception_table_lock(), which nests the completed and pending
exception hash-table spinlocks. Those locks protect the lookup result,
pending-exception insertion, pe->started, and the pending bio lists until
the bio has either been returned to DM core or queued on the pending
exception. Dropping the locks just to call bio_set_dev() would require
revalidating the exception state and preserving the pending-list ordering
rules; calling a sleepable bio_set_dev() while holding the spinlocks is not
allowed either.
Split out snapshot_bio_set_dev() for these locked remap decisions. It only
performs the non-sleeping part of bio_set_dev(): clear BIO_REMAPPED, clear
BIO_BPS_THROTTLED when the bdev changes, and update bi_bdev. It
deliberately does not associate the bio with a blkg while snapshot locks
are held.
This does not lose blkcg attribution for the normal DM_MAPIO_REMAPPED case.
After the target returns, DM core submits the mapped bio through
dm_submit_bio_remap(), and that helper clones the blkg association from the
original bio in the normal submission context.
Some snapshot bios are not submitted by DM core immediately. Writes
waiting for a pending exception and bios queued during snapshot merge are
kept on snapshot-owned lists and submitted later after copy or merge
completion. Once bio_set_dev() is no longer used in the locked path,
these delayed bios also need their blkcg association restored at submission
time. Submit those bios through dm_submit_bio_remap() instead of
submit_bio_noacct() so the association is cloned from the original bio
after the snapshot locks have been released.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
drivers/md/dm-snap.c | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1489fda9d24a..373a94156ec7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -192,6 +192,19 @@ static sector_t chunk_to_sector(struct dm_exception_store *store,
return chunk << store->chunk_shift;
}
+/*
+ * Snapshot exception-table locks are spinlocks. Only update the target
+ * device while holding them; dm_submit_bio_remap() will associate target-owned
+ * bios with the original bio's blkg from a sleepable submission context.
+ */
+static void snapshot_bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+ bio_clear_flag(bio, BIO_REMAPPED);
+ if (bio->bi_bdev != bdev)
+ bio_clear_flag(bio, BIO_BPS_THROTTLED);
+ bio->bi_bdev = bdev;
+}
+
static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
{
/*
@@ -1566,7 +1579,7 @@ static void flush_bios(struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- submit_bio_noacct(bio);
+ dm_submit_bio_remap(bio, NULL);
bio = n;
}
}
@@ -1586,7 +1599,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
bio->bi_next = NULL;
r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
- submit_bio_noacct(bio);
+ dm_submit_bio_remap(bio, NULL);
bio = n;
}
}
@@ -1827,7 +1840,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
bio->bi_end_io = full_bio_end_io;
bio->bi_private = callback_data;
- submit_bio_noacct(bio);
+ dm_submit_bio_remap(bio, NULL);
}
static struct dm_snap_pending_exception *
@@ -1898,7 +1911,7 @@ __find_pending_exception(struct dm_snapshot *s,
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
struct bio *bio, chunk_t chunk)
{
- bio_set_dev(bio, s->cow->bdev);
+ snapshot_bio_set_dev(bio, s->cow->bdev);
bio->bi_iter.bi_sector =
chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
(chunk - e->old_chunk)) +
@@ -1982,7 +1995,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
* defeat the goal of freeing space in origin that is
* implied by the "discard_passdown_origin" feature)
*/
- bio_set_dev(bio, s->origin->bdev);
+ snapshot_bio_set_dev(bio, s->origin->bdev);
track_chunk(s, bio, chunk);
goto out_unlock;
}
@@ -2081,7 +2094,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
goto out;
}
} else {
- bio_set_dev(bio, s->origin->bdev);
+ snapshot_bio_set_dev(bio, s->origin->bdev);
track_chunk(s, bio, chunk);
}
@@ -2143,7 +2156,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
chunk >= s->first_merging_chunk &&
chunk < (s->first_merging_chunk +
s->num_merging_chunks)) {
- bio_set_dev(bio, s->origin->bdev);
+ snapshot_bio_set_dev(bio, s->origin->bdev);
bio_list_add(&s->bios_queued_during_merge, bio);
r = DM_MAPIO_SUBMITTED;
goto out_unlock;
@@ -2157,7 +2170,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
}
redirect_to_origin:
- bio_set_dev(bio, s->origin->bdev);
+ snapshot_bio_set_dev(bio, s->origin->bdev);
if (bio_data_dir(bio) == WRITE) {
up_write(&s->lock);
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 04/17] blk-throttle: protect throttle state with td lock
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
blk-throttle currently uses queue_lock for both blkcg topology and its own
runtime state. That coupling prevents blkg topology protection from being
moved cleanly to blkcg_mutex.
Add a throttle-private spinlock and use it for throttle service queues,
pending timers, runtime counters and config updates. Keep queue_lock only
where the current intermediate code still walks blkcg topology.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
block/blk-throttle.c | 87 ++++++++++++++++++++++++++++++++++----------
1 file changed, 67 insertions(+), 20 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ffc3b70065d4..7bca2805404f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -30,6 +30,9 @@ static struct workqueue_struct *kthrotld_workqueue;
struct throtl_data
{
+ /* protects throttle service queues and group runtime state */
+ spinlock_t lock;
+
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
@@ -346,11 +349,16 @@ static void tg_update_has_rules(struct throtl_grp *tg)
static void throtl_pd_online(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
+ struct throtl_data *td = tg->td;
+ unsigned long flags;
+
+ spin_lock_irqsave(&td->lock, flags);
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
tg_update_has_rules(tg);
+ spin_unlock_irqrestore(&td->lock, flags);
}
static void tg_release(struct rcu_head *rcu)
@@ -368,7 +376,7 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
- timer_delete_sync(&tg->service_queue.pending_timer);
+ timer_shutdown_sync(&tg->service_queue.pending_timer);
call_rcu(&pd->rcu_head, tg_release);
}
@@ -1142,9 +1150,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
else
q = td->queue;
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
- if (!q->root_blkg)
+ if (!READ_ONCE(q->root_blkg))
goto out_unlock;
again:
@@ -1168,9 +1176,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
break;
/* this dispatch windows is still open, relax and repeat */
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
cpu_relax();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
}
if (!dispatched)
@@ -1193,7 +1201,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
}
/**
@@ -1209,7 +1217,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
struct throtl_data *td = container_of(work, struct throtl_data,
dispatch_work);
struct throtl_service_queue *td_sq = &td->service_queue;
- struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
@@ -1217,11 +1224,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_init(&bio_list_on_stack);
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
@@ -1299,7 +1306,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
rcu_read_unlock();
/*
- * We're already holding queue_lock and know @tg is valid. Let's
+ * We're already holding td->lock and know @tg is valid. Let's
* apply the new config directly.
*
* Restart the slices for both READ and WRITES. It might happen
@@ -1327,6 +1334,7 @@ static int blk_throtl_init(struct gendisk *disk)
return -ENOMEM;
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ spin_lock_init(&td->lock);
throtl_service_queue_init(&td->service_queue);
memflags = blk_mq_freeze_queue(disk->queue);
@@ -1381,6 +1389,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
+ spin_lock_irq(&tg->td->lock);
tg_update_carryover(tg);
if (is_u64)
@@ -1389,6 +1398,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
tg_conf_updated(tg, false);
+ spin_unlock_irq(&tg->td->lock);
ret = 0;
unprep:
@@ -1563,6 +1573,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto close_bdev;
tg = blkg_to_tg(ctx.blkg);
+ spin_lock_irq(&tg->td->lock);
tg_update_carryover(tg);
v[0] = tg->bps[READ];
@@ -1586,11 +1597,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
p = tok;
strsep(&p, "=");
if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
- goto unprep;
+ goto unlock;
ret = -ERANGE;
if (!val)
- goto unprep;
+ goto unlock;
ret = -EINVAL;
if (!strcmp(tok, "rbps"))
@@ -1602,7 +1613,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
else
- goto unprep;
+ goto unlock;
}
tg->bps[READ] = v[0];
@@ -1611,7 +1622,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->iops[WRITE] = v[3];
tg_conf_updated(tg, false);
+ spin_unlock_irq(&tg->td->lock);
ret = 0;
+ goto unprep;
+unlock:
+ spin_unlock_irq(&tg->td->lock);
unprep:
blkg_conf_unprep(&ctx);
close_bdev:
@@ -1636,6 +1651,28 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
+static void throtl_shutdown_timers(struct request_queue *q)
+{
+ struct throtl_data *td = q->td;
+ struct blkcg_gq *blkg;
+
+ /*
+ * blkg_destroy_all() has already offlined the policy, but blkg policy
+ * data is freed asynchronously. Shut down per-group timers before
+ * freeing td, as their callbacks still dereference tg->td.
+ */
+ mutex_lock(&q->blkcg_mutex);
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg)
+ timer_shutdown_sync(&tg->service_queue.pending_timer);
+ }
+ mutex_unlock(&q->blkcg_mutex);
+
+ timer_shutdown_sync(&td->service_queue.pending_timer);
+}
+
static void tg_flush_bios(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
@@ -1669,7 +1706,13 @@ static void tg_flush_bios(struct throtl_grp *tg)
static void throtl_pd_offline(struct blkg_policy_data *pd)
{
- tg_flush_bios(pd_to_tg(pd));
+ struct throtl_grp *tg = pd_to_tg(pd);
+ struct throtl_data *td = tg->td;
+ unsigned long flags;
+
+ spin_lock_irqsave(&td->lock, flags);
+ tg_flush_bios(tg);
+ spin_unlock_irqrestore(&td->lock, flags);
}
struct blkcg_policy blkcg_policy_throtl = {
@@ -1725,6 +1768,7 @@ static void tg_cancel_writeback_bios(struct throtl_grp *tg,
void blk_throtl_cancel_bios(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
+ struct throtl_data *td = q->td;
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
struct bio_list cancel_bios[2] = { };
@@ -1734,6 +1778,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
return;
spin_lock_irq(&q->queue_lock);
+ spin_lock(&td->lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
@@ -1752,6 +1797,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios);
}
rcu_read_unlock();
+ spin_unlock(&td->lock);
spin_unlock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++) {
@@ -1791,7 +1837,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
bool __blk_throtl_bio(struct bio *bio)
{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1801,7 +1846,7 @@ bool __blk_throtl_bio(struct bio *bio)
struct throtl_data *td = tg->td;
rcu_read_lock();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
sq = &tg->service_queue;
while (true) {
@@ -1877,7 +1922,7 @@ bool __blk_throtl_bio(struct bio *bio)
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
rcu_read_unlock();
return throttled;
@@ -1886,17 +1931,19 @@ bool __blk_throtl_bio(struct bio *bio)
void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
+ struct throtl_data *td = q->td;
/*
* blkg_destroy_all() already deactivate throtl policy, just check and
* free throtl data.
*/
- if (!q->td)
+ if (!td)
return;
- timer_delete_sync(&q->td->service_queue.pending_timer);
+ throtl_shutdown_timers(q);
throtl_shutdown_wq(q);
- kfree(q->td);
+ q->td = NULL;
+ kfree(td);
}
static int __init throtl_init(void)
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 05/17] block: add bio_alloc_atomic() for atomic bio users
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
Add bio_alloc_atomic() for callers that need a GFP_ATOMIC bio from the
default bio set but cannot safely pass a bdev during allocation. The
helper returns an unattached bio, leaving callers to set bi_bdev and
attach blkcg state explicitly before submission.
Use the helper for virtio-pmem flush child bios and OCFS2 heartbeat I/O.
Both allocate bios from atomic paths and must avoid creating missing blkgs
once blkg creation is protected by q->blkcg_mutex. virtio-pmem clones the
parent bio's blkg association; OCFS2 binds heartbeat I/O to the root blkg.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
drivers/nvdimm/nd_virtio.c | 8 ++++----
fs/ocfs2/cluster/heartbeat.c | 15 ++++++++++++---
include/linux/bio.h | 6 ++++++
3 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 4176046627be..13d1ed1c466c 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -115,13 +115,13 @@ int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
* parent bio. Otherwise directly call nd_region flush.
*/
if (bio && bio->bi_iter.bi_sector != -1) {
- struct bio *child = bio_alloc(bio->bi_bdev, 0,
- REQ_OP_WRITE | REQ_PREFLUSH,
- GFP_ATOMIC);
+ struct bio *child = bio_alloc_atomic(0,
+ REQ_OP_WRITE | REQ_PREFLUSH);
if (!child)
return -ENOMEM;
- bio_clone_blkg_association(child, bio);
+ child->bi_bdev = bio->bi_bdev;
+ bio_clone_blkg_association(child, bio);
child->bi_iter.bi_sector = -1;
bio_chain(child, bio);
submit_bio(child);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index d12784aaaa4b..ec70f3b62837 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -519,16 +520,24 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct bio *bio;
struct page *page;
- /* Testing has shown this allocation to take long enough under
+ /*
+ * Testing has shown this allocation to take long enough under
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
- * all together. */
- bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
+ * all together.
+ *
+ * Use the atomic bio allocation helper so bio_init() does not create a
+ * missing blkg. Heartbeat IO is cluster-liveness IO, so account it to
+ * the root blkcg instead.
+ */
+ bio = bio_alloc_atomic(16, opf);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
goto bail;
}
+ bio->bi_bdev = reg_bdev(reg);
+ bio_associate_blkg_from_css(bio, blkcg_root_css);
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8f33f717b14f..f7d94d37893f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -366,6 +366,12 @@ static inline struct bio *bio_alloc(struct block_device *bdev,
return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set);
}
+static inline struct bio *bio_alloc_atomic(unsigned short nr_vecs,
+ blk_opf_t opf)
+{
+ return bio_alloc_bioset(NULL, nr_vecs, opf, GFP_ATOMIC, &fs_bio_set);
+}
+
void submit_bio(struct bio *bio);
extern void bio_endio(struct bio *);
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 06/17] blk-cgroup: support non-blocking bio association
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
Allow bio association helpers to be called from non-blocking paths by
returning whether the association succeeded and by taking a nowait argument.
The normal callers pass nowait=false and keep the existing behavior of
creating missing blkgs.
For nowait=true, the helper only succeeds when the needed blkg already
exists. This lets callers set or clone a bio's bdev without entering the
sleepable missing-blkg creation path.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
block/bfq-cgroup.c | 5 ++--
block/bio.c | 6 ++---
block/blk-cgroup.c | 44 ++++++++++++++++++++++++---------
block/blk-crypto-fallback.c | 2 +-
drivers/md/bcache/request.c | 2 +-
drivers/md/dm.c | 2 +-
drivers/md/md.c | 2 +-
drivers/nvdimm/nd_virtio.c | 5 +++-
fs/gfs2/lops.c | 3 +--
fs/ocfs2/cluster/heartbeat.c | 2 +-
include/linux/bio.h | 47 ++++++++++++++++++++++++------------
include/linux/writeback.h | 2 +-
mm/page_io.c | 2 +-
13 files changed, 82 insertions(+), 42 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index e82ff03bda02..5c2faf56c8ef 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -616,13 +616,14 @@ struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio)
}
bfqg = blkg_to_bfqg(blkg);
if (bfqg->pd.online) {
- bio_associate_blkg_from_css(bio, &blkg->blkcg->css);
+ bio_associate_blkg_from_css(bio, &blkg->blkcg->css, false);
return bfqg;
}
blkg = blkg->parent;
}
bio_associate_blkg_from_css(bio,
- &bfqg_to_blkg(bfqd->root_group)->blkcg->css);
+ &bfqg_to_blkg(bfqd->root_group)->blkcg->css,
+ false);
return bfqd->root_group;
}
diff --git a/block/bio.c b/block/bio.c
index f2a5f4d0a967..b74e9961c8ee 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -236,7 +236,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_blkg = NULL;
bio->issue_time_ns = 0;
if (bdev)
- bio_associate_blkg(bio);
+ bio_associate_blkg(bio, false);
#ifdef CONFIG_BLK_CGROUP_IOCOST
bio->bi_iocost_cost = 0;
#endif
@@ -281,7 +281,7 @@ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
bio->bi_io_vec = bv;
bio->bi_bdev = bdev;
if (bio->bi_bdev)
- bio_associate_blkg(bio);
+ bio_associate_blkg(bio, false);
bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
@@ -857,7 +857,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
if (bio->bi_bdev == bio_src->bi_bdev &&
bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio_clone_blkg_association(bio, bio_src);
+ bio_clone_blkg_association(bio, bio_src, false);
}
if (bio_crypt_clone(bio, bio_src, gfp) < 0)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d2a1f5903f24..92846094043a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -2068,7 +2068,7 @@ static inline struct blkcg_gq *blkg_lookup_tryget(struct blkcg_gq *blkg)
* up taking a reference on or %NULL if no reference was taken.
*/
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
- struct cgroup_subsys_state *css)
+ struct cgroup_subsys_state *css, bool nowait)
{
struct request_queue *q = bio->bi_bdev->bd_queue;
struct blkcg *blkcg = css_to_blkcg(css);
@@ -2110,18 +2110,30 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
* A reference will be taken on the blkg and will be released when @bio is
* freed.
*/
-void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css)
+bool bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css, bool nowait)
{
- if (bio->bi_blkg)
+ struct blkcg_gq *blkg;
+
+ if (!nowait)
+ might_sleep();
+
+ if (bio->bi_blkg) {
blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
if (css && css->parent) {
- bio->bi_blkg = blkg_tryget_closest(bio, css);
+ blkg = blkg_tryget_closest(bio, css, nowait);
+ if (!blkg)
+ return false;
+ bio->bi_blkg = blkg;
} else {
blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
}
+
+ return true;
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
@@ -2134,16 +2146,19 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
* already associated, the css is reused and association redone as the
* request_queue may have changed.
*/
-void bio_associate_blkg(struct bio *bio)
+bool bio_associate_blkg(struct bio *bio, bool nowait)
{
struct cgroup_subsys_state *css;
+ bool ret;
if (blk_op_is_passthrough(bio->bi_opf))
- return;
+ return true;
+ if (!bio->bi_bdev)
+ return true;
if (bio->bi_blkg) {
css = bio_blkcg_css(bio);
- bio_associate_blkg_from_css(bio, css);
+ return bio_associate_blkg_from_css(bio, css, nowait);
} else {
rcu_read_lock();
css = blkcg_css();
@@ -2151,9 +2166,10 @@ void bio_associate_blkg(struct bio *bio)
css = NULL;
rcu_read_unlock();
- bio_associate_blkg_from_css(bio, css);
+ ret = bio_associate_blkg_from_css(bio, css, nowait);
if (css)
css_put(css);
+ return ret;
}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);
@@ -2163,10 +2179,14 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg);
* @dst: destination bio
* @src: source bio
*/
-void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+bool bio_clone_blkg_association(struct bio *dst, struct bio *src, bool nowait)
{
- if (src->bi_blkg)
- bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
+ if (!src->bi_blkg)
+ return true;
+ if (!dst->bi_bdev)
+ return false;
+
+ return bio_associate_blkg_from_css(dst, bio_blkcg_css(src), nowait);
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 2a5c52ab74b4..b99470bee8b6 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -187,7 +187,7 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio_clone_blkg_association(bio, bio_src);
+ bio_clone_blkg_association(bio, bio_src, false);
/*
* Move page array up in the allocated memory for the bio vecs as far as
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3fa3b13a410f..c2b7a694ea99 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -848,7 +848,7 @@ static CLOSURE_CALLBACK(cached_dev_read_done)
s->iop.bio->bi_iter.bi_sector =
s->cache_miss->bi_iter.bi_sector;
s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
- bio_clone_blkg_association(s->iop.bio, s->cache_miss);
+ bio_clone_blkg_association(s->iop.bio, s->cache_miss, false);
bch_bio_map(s->iop.bio, NULL);
bio_copy_data(s->cache_miss, s->iop.bio);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7287bed6eb64..c54636235ffe 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1364,7 +1364,7 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
if (!tgt_clone)
tgt_clone = clone;
- bio_clone_blkg_association(tgt_clone, io->orig_bio);
+ bio_clone_blkg_association(tgt_clone, io->orig_bio, false);
/*
* Account io->origin_bio to DM dev on behalf of target
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d1465bcd86c8..d63c8841aaad 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9355,7 +9355,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
return;
bio_chain(discard_bio, bio);
- bio_clone_blkg_association(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio, false);
mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
submit_bio_noacct(discard_bio);
}
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 13d1ed1c466c..0391b41a4fce 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -121,7 +121,10 @@ int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
if (!child)
return -ENOMEM;
child->bi_bdev = bio->bi_bdev;
- bio_clone_blkg_association(child, bio);
+ if (!bio_clone_blkg_association(child, bio, true)) {
+ bio_put(child);
+ return -ENOMEM;
+ }
child->bi_iter.bi_sector = -1;
bio_chain(child, bio);
submit_bio(child);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6dabe73ad790..ac45ccbde2a9 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -484,7 +484,7 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs,
struct bio *new;
new = bio_alloc(prev->bi_bdev, nr_iovecs, opf, GFP_NOIO);
- bio_clone_blkg_association(new, prev);
+ bio_clone_blkg_association(new, prev, false);
new->bi_iter.bi_sector = sector;
bio_chain(new, prev);
submit_bio(prev);
@@ -1114,4 +1114,3 @@ const struct gfs2_log_operations *gfs2_log_ops[] = {
&gfs2_revoke_lops,
NULL,
};
-
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ec70f3b62837..eb7f30707092 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -537,7 +537,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
goto bail;
}
bio->bi_bdev = reg_bdev(reg);
- bio_associate_blkg_from_css(bio, blkcg_root_css);
+ bio_associate_blkg_from_css(bio, blkcg_root_css, true);
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f7d94d37893f..026df09a2546 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -508,19 +508,39 @@ static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
#define bio_dev(bio) \
disk_devt((bio)->bi_bdev->bd_disk)
+static inline void bio_set_dev_no_blkg(struct bio *bio,
+ struct block_device *bdev)
+{
+ bio_clear_flag(bio, BIO_REMAPPED);
+ if (bio->bi_bdev != bdev)
+ bio_clear_flag(bio, BIO_BPS_THROTTLED);
+ bio->bi_bdev = bdev;
+}
+
#ifdef CONFIG_BLK_CGROUP
-void bio_associate_blkg(struct bio *bio);
-void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css);
-void bio_clone_blkg_association(struct bio *dst, struct bio *src);
+bool bio_associate_blkg(struct bio *bio, bool nowait);
+bool bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css,
+ bool nowait);
+bool bio_clone_blkg_association(struct bio *dst, struct bio *src,
+ bool nowait);
void blkcg_punt_bio_submit(struct bio *bio);
#else /* CONFIG_BLK_CGROUP */
-static inline void bio_associate_blkg(struct bio *bio) { }
-static inline void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css)
-{ }
-static inline void bio_clone_blkg_association(struct bio *dst,
- struct bio *src) { }
+static inline bool bio_associate_blkg(struct bio *bio, bool nowait)
+{
+ return true;
+}
+static inline bool bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css,
+ bool nowait)
+{
+ return true;
+}
+static inline bool bio_clone_blkg_association(struct bio *dst,
+ struct bio *src, bool nowait)
+{
+ return true;
+}
static inline void blkcg_punt_bio_submit(struct bio *bio)
{
submit_bio(bio);
@@ -529,11 +549,8 @@ static inline void blkcg_punt_bio_submit(struct bio *bio)
static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
{
- bio_clear_flag(bio, BIO_REMAPPED);
- if (bio->bi_bdev != bdev)
- bio_clear_flag(bio, BIO_BPS_THROTTLED);
- bio->bi_bdev = bdev;
- bio_associate_blkg(bio);
+ bio_set_dev_no_blkg(bio, bdev);
+ bio_associate_blkg(bio, false);
}
/*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 62552a2ce5b9..8165536fbbb0 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -262,7 +262,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
* regular writeback instead of writing things out itself.
*/
if (wbc->wb)
- bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
+ bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css, false);
}
void inode_switch_wbs_work_fn(struct work_struct *work);
diff --git a/mm/page_io.c b/mm/page_io.c
index c96d3e4cf872..48404f8604cb 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -321,7 +321,7 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
css = NULL;
rcu_read_unlock();
- bio_associate_blkg_from_css(bio, css);
+ bio_associate_blkg_from_css(bio, css, false);
if (css)
css_put(css);
}
--
2.51.0
^ permalink raw reply related
* [RFC PATCH v1 07/17] block: support non-blocking bio allocation with a bdev
From: Yu Kuai @ 2026-07-04 19:51 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo
Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, Alasdair Kergon,
Benjamin Marzinski, Mike Snitzer, Mikulas Patocka, Dongsheng Yang,
Zheng Gu, Coly Li, Kent Overstreet, Josef Bacik, Yu Kuai,
Nilay Shroff, linux-block, cgroups, linux-nvme, dm-devel,
linux-bcache
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
bio_alloc_clone(), bio_init_clone(), and bio_alloc_bioset() can be called
with non-blocking GFP masks. Passing a bdev into bio initialization may
need to associate blkcg state and, once the creation of a missing blkg is
serialized by q->blkcg_mutex, that association can sleep.
Keep the generic block layer simple by letting bio_alloc_bioset() handle this
case directly. Non-blocking allocations initialize the bio without a bdev,
set the bdev fields, and associate the blkg with nowait=true. If the needed
blkg is missing and would have to be created, the allocation fails like any
other allocation failure (NULL from bio_alloc_bioset(), -ENOMEM from
bio_init_clone()), so the caller can retry from a blocking context.
Blocking callers keep the existing allocation-time association behavior.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
block/bio.c | 46 ++++++++++++++++++++++++++++++++++++++++------
1 file changed, 40 insertions(+), 6 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index b74e9961c8ee..863ae73a4222 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -259,6 +259,20 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
}
EXPORT_SYMBOL(bio_init);
+static bool bio_init_nowait(struct bio *bio, struct block_device *bdev,
+ struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf)
+{
+ bio_init(bio, NULL, table, max_vecs, opf);
+ if (bdev) {
+ bio_set_dev_no_blkg(bio, bdev);
+ if (bio_associate_blkg(bio, true))
+ return true;
+ bio_uninit(bio);
+ return false;
+ }
+ return true;
+}
+
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
@@ -599,12 +613,25 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
}
}
- if (nr_vecs && nr_vecs <= BIO_INLINE_VECS)
- bio_init_inline(bio, bdev, nr_vecs, opf);
- else
- bio_init(bio, bdev, bvecs, nr_vecs, opf);
+ if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) {
+ bvecs = bio_inline_vecs(bio);
+ if (gfpflags_allow_blocking(saved_gfp))
+ bio_init(bio, bdev, bvecs, nr_vecs, opf);
+ else if (!bio_init_nowait(bio, bdev, bvecs, nr_vecs, opf))
+ goto fail_free_bio;
+ } else {
+ if (gfpflags_allow_blocking(saved_gfp))
+ bio_init(bio, bdev, bvecs, nr_vecs, opf);
+ else if (!bio_init_nowait(bio, bdev, bvecs, nr_vecs, opf))
+ goto fail_free_bio;
+ }
bio->bi_pool = bs;
return bio;
+
+fail_free_bio:
+ bio->bi_pool = bs;
+ bio_put(bio);
+ return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
@@ -857,7 +884,9 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
if (bio->bi_bdev == bio_src->bi_bdev &&
bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio_clone_blkg_association(bio, bio_src, false);
+ if (!bio_clone_blkg_association(bio, bio_src,
+ !gfpflags_allow_blocking(gfp)))
+ return -ENOMEM;
}
if (bio_crypt_clone(bio, bio_src, gfp) < 0)
@@ -913,9 +942,14 @@ EXPORT_SYMBOL(bio_alloc_clone);
int bio_init_clone(struct block_device *bdev, struct bio *bio,
struct bio *bio_src, gfp_t gfp)
{
+ bool blocking = gfpflags_allow_blocking(gfp);
int ret;
- bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ if (blocking)
+ bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ else if (!bio_init_nowait(bio, bdev, bio_src->bi_io_vec, 0,
+ bio_src->bi_opf))
+ return -ENOMEM;
ret = __bio_clone(bio, bio_src, gfp);
if (ret)
bio_uninit(bio);
--
2.51.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox