Linux block layer
 help / color / mirror / Atom feed
* [PATCH RFC 3/6] block: split bdev_yield_claim() out of bdev_fput()
From: Christian Brauner @ 2026-06-15 13:18 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260615-work-super-freeze_deny_upstream-v1-0-a6c72b840e7d@kernel.org>

bdev_fput() yields the holder claim and then closes the file, which is a
deferred operation.  Split the yield half into bdev_yield_claim() so a caller
can give up the holder while the file - and therefore the block device - is
still open, act on the device, and only then bdev_fput().

A filesystem that made a device unfreezable for a membership change with
bdev_deny_freeze() undoes the deny on release with

	bdev_yield_claim(bdev_file);
	bdev_allow_freeze(file_bdev(bdev_file));
	bdev_fput(bdev_file);

Re-allowing only after the holder is yielded avoids stranding the filesystem
on a racing freeze, and doing it while the file is still open avoids touching
the block device after bdev_fput().  bdev_fput() yields again, which is a
no-op once the claim has already been given up.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 block/bdev.c           | 30 ++++++++++++++++++++++--------
 include/linux/blkdev.h |  1 +
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 939dec351772..e59052c2a081 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1199,18 +1199,16 @@ void bdev_release(struct file *bdev_file)
 }
 
 /**
- * bdev_fput - yield claim to the block device and put the file
+ * bdev_yield_claim - give up the holder claim on an open block device
  * @bdev_file: open block device
  *
- * Yield claim on the block device and put the file. Ensure that the
- * block device can be reclaimed before the file is closed which is a
- * deferred operation.
+ * Yield the holder and any write access for @bdev_file without closing it, so
+ * the caller can still act on the device - e.g. bdev_allow_freeze() it - before
+ * the final bdev_fput().  bdev_fput() yields too, so calling it afterwards is
+ * safe.
  */
-void bdev_fput(struct file *bdev_file)
+void bdev_yield_claim(struct file *bdev_file)
 {
-	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
-		return;
-
 	if (bdev_file->private_data) {
 		struct block_device *bdev = file_bdev(bdev_file);
 		struct gendisk *disk = bdev->bd_disk;
@@ -1226,7 +1224,23 @@ void bdev_fput(struct file *bdev_file)
 		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
 		mutex_unlock(&disk->open_mutex);
 	}
+}
+EXPORT_SYMBOL_GPL(bdev_yield_claim);
+
+/**
+ * bdev_fput - yield claim to the block device and put the file
+ * @bdev_file: open block device
+ *
+ * Yield claim on the block device and put the file. Ensure that the
+ * block device can be reclaimed before the file is closed which is a
+ * deferred operation.
+ */
+void bdev_fput(struct file *bdev_file)
+{
+	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
+		return;
 
+	bdev_yield_claim(bdev_file);
 	fput(bdev_file);
 }
 EXPORT_SYMBOL(bdev_fput);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cf1951caadb2..9fc16e3c8075 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1832,6 +1832,7 @@ int bdev_thaw(struct block_device *bdev);
 int bdev_deny_freeze(struct block_device *bdev);
 void bdev_allow_freeze(struct block_device *bdev);
 void bdev_fput(struct file *bdev_file);
+void bdev_yield_claim(struct file *bdev_file);
 
 struct io_comp_batch {
 	struct rq_list req_list;

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC 2/6] block: allow making a block device unfreezable
From: Christian Brauner @ 2026-06-15 13:18 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260615-work-super-freeze_deny_upstream-v1-0-a6c72b840e7d@kernel.org>

Add bdev_deny_freeze() and bdev_allow_freeze(), modeled on
deny_write_access()/allow_write_access().  bd_fsfreeze_count becomes a
signed counter: > 0 counts active freezes, < 0 counts deniers, and the
two regimes are mutually exclusive.  bdev_freeze() refuses with -EBUSY
while a deny is held, and bdev_deny_freeze() refuses while the device is
frozen.

A filesystem that mutates a device's membership (a btrfs device add,
remove or replace) denies freezing on the device for the duration, so a
claim a freeze walk might act on is never added or torn down behind the
freezer's back.

The deny/allow helpers are a single atomic on bd_fsfreeze_count and take
no lock, so they can be called while holding s_umount without inverting
against bdev_freeze()'s bd_fsfreeze_mutex -> s_umount order.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 block/bdev.c              | 43 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  2 +-
 include/linux/blkdev.h    |  2 ++
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index bb0ffa3bb4df..939dec351772 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -304,7 +304,12 @@ int bdev_freeze(struct block_device *bdev)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
-	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+	/* A device being removed from its filesystem refuses freezes. */
+	if (!atomic_inc_unless_negative(&bdev->bd_fsfreeze_count)) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EBUSY;
+	}
+	if (atomic_read(&bdev->bd_fsfreeze_count) > 1) {
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return 0;
 	}
@@ -368,6 +373,42 @@ int bdev_thaw(struct block_device *bdev)
 }
 EXPORT_SYMBOL(bdev_thaw);
 
+/**
+ * bdev_deny_freeze - make a block device unfreezable
+ * @bdev: block device
+ *
+ * Reserve @bdev against bdev_freeze() the way deny_write_access() reserves a
+ * file against writers.  bd_fsfreeze_count is sign-encoded: > 0 counts active
+ * freezes, < 0 counts deniers, so a deny succeeds only while no freeze is in
+ * progress.  While held, bdev_freeze() returns -EBUSY.  Pair with
+ * bdev_allow_freeze().
+ *
+ * A filesystem removing, adding or replacing a member device denies freezes on
+ * it for the duration, so a claim a freeze walk might act on is never torn down
+ * behind the freezer's back.  The deny is device-scoped, not (device,
+ * superblock)-scoped: a device shared by several superblocks is refused for all
+ * of them.  No in-tree filesystem removes a shared claim from a live superblock.
+ *
+ * Return: 0, or -EBUSY if the device is currently frozen.
+ */
+int bdev_deny_freeze(struct block_device *bdev)
+{
+	return atomic_dec_unless_positive(&bdev->bd_fsfreeze_count) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(bdev_deny_freeze);
+
+/**
+ * bdev_allow_freeze - allow freezing a block device again
+ * @bdev: block device
+ *
+ * Undo one bdev_deny_freeze().
+ */
+void bdev_allow_freeze(struct block_device *bdev)
+{
+	atomic_inc(&bdev->bd_fsfreeze_count);
+}
+EXPORT_SYMBOL_GPL(bdev_allow_freeze);
+
 /*
  * pseudo-fs
  */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..5a725a0cd35f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,7 +66,7 @@ struct block_device {
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-	atomic_t		bd_fsfreeze_count; /* number of freeze requests */
+	atomic_t		bd_fsfreeze_count; /* >0 freeze requests, <0 freeze deniers */
 	struct mutex		bd_fsfreeze_mutex; /* serialize freeze/thaw */
 
 	struct partition_meta_info *bd_meta_info;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..cf1951caadb2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1829,6 +1829,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
 
 int bdev_freeze(struct block_device *bdev);
 int bdev_thaw(struct block_device *bdev);
+int bdev_deny_freeze(struct block_device *bdev);
+void bdev_allow_freeze(struct block_device *bdev);
 void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC 1/6] btrfs: destroy the target device when mark_block_group_to_copy() fails
From: Christian Brauner @ 2026-06-15 13:18 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260615-work-super-freeze_deny_upstream-v1-0-a6c72b840e7d@kernel.org>

btrfs_dev_replace_start() opens the replacement target with
btrfs_init_dev_replace_tgtdev(), which adds it to the device list and opens
its block device.  Every error path after that point reaches the 'leave'
label to tear the target back down with btrfs_destroy_dev_replace_tgtdev() -
except the mark_block_group_to_copy() failure, which returns directly.  The
target is then leaked: it stays on the device list with its block device
held until the filesystem is unmounted.

Goto leave like the other post-open error paths so the target is destroyed.

Fixes: 78ce9fc269af ("btrfs: zoned: mark block groups to copy for device-replace")
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/dev-replace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8f8fa14886de..0112aa6d7ab1 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -624,7 +624,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 
 	ret = mark_block_group_to_copy(fs_info, src_device);
 	if (ret)
-		return ret;
+		goto leave;
 
 	down_write(&dev_replace->rwsem);
 	dev_replace->replace_task = current;

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC 0/6] block,btrfs: fix frozen-superblock strand on device add/remove/replace
From: Christian Brauner @ 2026-06-15 13:18 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)

A block-device freeze that races a btrfs device membership change can leave
the whole filesystem stuck frozen, recoverable only with a manual FITHAW.

btrfs holds each of its devices open with the superblock as the block-device
holder.  bdev_freeze() - issued by "dmsetup suspend", an LVM snapshot, or
FIFREEZE on the bare device - resolves that holder to freeze the filesystem,
and bdev_thaw() ("dmsetup resume") resolves it again to thaw.  If a freeze
lands while btrfs is adding, removing or replacing a device, it rides in on
the device's holder link and freezes the filesystem; the membership change
then drops that link, so the matching thaw can no longer find the
superblock.  The filesystem stays frozen with no way back short of FITHAW.

To reproduce on the remove path: build a two-device btrfs with one member
behind a dm-linear target, write enough data that removing that member
relocates for a few seconds, start "btrfs device remove" on it, and
"dmsetup suspend" the dm device while the removal is underway.  The suspend's
freeze blocks on the remove ioctl's write access and rides in as the ioctl
drops it; the removal then clears the device's holder link, so the matching
"dmsetup resume" can no longer reach the superblock.  On an unpatched kernel
the filesystem is left frozen and the next write hangs in D state until a
manual FITHAW (fsfreeze -u).

The fix lets a filesystem forbid freezing a device for the duration of a
membership change, modelled on deny_write_access()/allow_write_access().
bd_fsfreeze_count becomes signed: > 0 counts active freezes, < 0 counts deny
holders, and the two are mutually exclusive.  bdev_deny_freeze() reserves the
device (bdev_freeze() then returns -EBUSY) and bdev_allow_freeze() releases
it; both are a single lockless atomic, so a filesystem can deny under
s_umount without inverting against bdev_freeze()'s bd_fsfreeze_mutex.  btrfs
denies the device across each add, remove and replace, so a racing freeze is
refused instead of riding in, while a normal freeze of a settled member
still works.

To re-allow freezing safely on release, bdev_yield_claim() is split out of
bdev_fput(): the caller yields the holder while the device file is still
open, re-allows freezing on the now-holderless device, and only then closes
it.  Re-allowing after the holder is gone avoids re-stranding on a racing
freeze; doing it while the file is still open keeps the block device alive
without referencing it after the final fput.

Patch 1 is independent: it fixes a pre-existing target-device leak on the
mark_block_group_to_copy() failure path in btrfs_dev_replace_start().  The
replace bracket reuses that 'leave' unwind, so it comes first.

With the fix the racing suspend is refused with -EBUSY mid-removal and the
filesystem stays writable.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Christian Brauner (6):
      btrfs: destroy the target device when mark_block_group_to_copy() fails
      block: allow making a block device unfreezable
      block: split bdev_yield_claim() out of bdev_fput()
      btrfs: deny freezing a device while it is being removed
      btrfs: deny freezing a device while it is being added
      btrfs: deny freezing devices undergoing a replace

 block/bdev.c              | 73 ++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/dev-replace.c    | 67 +++++++++++++++++++++++++++++++++-----
 fs/btrfs/ioctl.c          |  4 +--
 fs/btrfs/volumes.c        | 83 ++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/volumes.h        |  6 +++-
 include/linux/blk_types.h |  2 +-
 include/linux/blkdev.h    |  3 ++
 7 files changed, 206 insertions(+), 32 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260615-work-super-freeze_deny_upstream-498ae64761a0


^ permalink raw reply

* [PATCH] block: remove redundant GD_NEED_PART_SCAN in add_disk_final()
From: Connor Williamson @ 2026-06-15 13:07 UTC (permalink / raw)
  To: axboe
  Cc: linux-block, linux-kernel, stable, yukuai3, hch, jack,
	nh-open-source, connordw

add_disk_final() sets GD_NEED_PART_SCAN before calling bdev_add(),
then calls disk_scan_partitions() which sets the flag itself. The
early set is redundant and introduces a race.

Between bdev_add() and disk_scan_partitions(), concurrent openers
(multipathd, blkid, LVM) see the flag in blkdev_get_whole() and
trigger bdev_disk_changed(). When disk_scan_partitions() then runs,
it calls bdev_disk_changed() again, dropping the partitions the
concurrent opener already created before re-adding them, which can
result in transient partition disappearances.

The race is observable by inserting an msleep() between bdev_add()
and disk_scan_partitions() while running concurrent open() calls
during device bind. Without artificial delay, it manifests under
scheduling pressure during boot on systems with aggressive device
scanners (multipathd, systemd-udevd).

Therefore, do not set GD_NEED_PART_SCAN in add_disk_final(). Other
GD_NEED_PART_SCAN consumers (blkdev_get_whole(),
sd_need_revalidate()) should not be affected as the flag
is set internally by disk_scan_partitions().

The retry-on-next-open intention from commit e5cfefa97bcc
("block: fix scan partition for exclusively open device again")
should also not be affected as the early return paths in
disk_scan_partitions() should be unreachable at device registration
time (bd_holder is NULL and open_partitions is zero).

Fixes: e5cfefa97bcc ("block: fix scan partition for exclusively open device again")
Cc: stable@vger.kernel.org
Signed-off-by: Connor Williamson <connordw@amazon.com>
---
 block/genhd.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e95..0da6cdf3d5fb0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -407,10 +407,6 @@ static void add_disk_final(struct gendisk *disk)
 	struct device *ddev = disk_to_dev(disk);

 	if (!(disk->flags & GENHD_FL_HIDDEN)) {
-		/* Make sure the first partition scan will be proceed */
-		if (get_capacity(disk) && disk_has_partscan(disk))
-			set_bit(GD_NEED_PART_SCAN, &disk->state);
-
 		bdev_add(disk->part0, ddev->devt);
 		if (get_capacity(disk))
 			disk_scan_partitions(disk, BLK_OPEN_READ);
--
2.47.3


^ permalink raw reply related

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Zdenek Kabelac @ 2026-06-15 13:07 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, linux-block, dm-devel
In-Reply-To: <ai7rnH20IYeSmY8s@gallifrey>

Dne 14. 06. 26 v 19:57 Dr. David Alan Gilbert napsal(a):
> Hi,
>    I've got a repeatable raid hang/warn and would appreciate some pointers
> as where to debug.
>    (I've been logging stuff on  https://bugzilla.kernel.org/show_bug.cgi?id=221535 )
> 
>    This started off as debugging a case where I'd get my RAID1 (on the host)
> getting a reliable 'rescheduling sector'/disk failure while running the qemu block test suite
> during a qemu build, but then I tried to build a smaller discrete
> test, and now I've got a simply triggerable warn and test hang.
> There's no errors from the underlying SATA layer on the storage,
> everything resyncs just fine.
> 
> I've got an existing LVM vg ('main') with two mirrors on sda2, and sdb2
> which are SATA disks.
> 
> # lvcreate --type mirror --mirrors 1 -L 1G main /dev/sda2 /dev/sdb2
>

Hi

It's probably worth saying here - the '--type mirror' is the OLD (historical)
DM mirror target implementation - this target is no longer under very active
development, as users are supposed to be using the newer (and faster) md-wrapped
'--type raid1'.

So if you use   'lvcreate -m1 ....'   you 'auto-magically' get the newer raid1
mirroring target.

But this obviously doesn't fix the problem in the old mirror target...

Regards

Zdenek


^ permalink raw reply

* Re: [PATCH v5 5/9] block: implement NVMEM provider
From: Bartosz Golaszewski @ 2026-06-15 13:06 UTC (permalink / raw)
  To: Loic Poulain
  Cc: Bartosz Golaszewski, linux-mmc, devicetree, linux-kernel,
	linux-arm-msm, linux-block, linux-wireless, ath10k,
	linux-bluetooth, netdev, daniel, Ulf Hansson, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson, Konrad Dybcio,
	Jens Axboe, Johannes Berg, Jeff Johnson, Marcel Holtmann,
	Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
	Russell King, Saravana Kannan
In-Reply-To: <CAFEp6-0oxBEdfH-fqhdM18pt4JewLwrMOON9qpQgLFh8KS0hDg@mail.gmail.com>

On Mon, 15 Jun 2026 11:33:22 +0200, Loic Poulain
<loic.poulain@oss.qualcomm.com> said:
> On Mon, Jun 15, 2026 at 11:28 AM Loic Poulain
> <loic.poulain@oss.qualcomm.com> wrote:
>>
>
> Also we cannot safely return -EPROBE_DEFER from add_disk_final()
> either. The NVMEM registration point is late in the sequence, too much
> has already happened to easily unwind. The easiest is that the NVMEM
> simply won't be available if registration fails, which looks
> acceptable?
>

I'd argue that it's a problem with subsystem code then as unwinding should
work fine no matter the point in the sequence when it's initiated but I guess
this isn't really an issue in your patches.

I suppose we shouldn't typically run into probe deferral here so I'm fine just
ignoring the return value.

Bart

^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-15 12:50 UTC (permalink / raw)
  To: Thorsten Leemhuis, kbusch, vklimovs, trnka
  Cc: linux-block, dm-devel, Linux kernel regressions list
In-Reply-To: <165d3195-c81d-4760-870b-23a9a3b3b72c@leemhuis.info>

* Thorsten Leemhuis (regressions@leemhuis.info) wrote:
> On 6/14/26 19:57, Dr. David Alan Gilbert wrote:
> >
> >   I've got a repeatable raid hang/warn and would appreciate some pointers
> > as where to debug.
> >   (I've been logging stuff on  https://bugzilla.kernel.org/show_bug.cgi?id=221535 )
> 
> Note: not my area of expertise, so I might be sending you totally
> off-track with this comment. Feel free to ignore it. But FWIW:

Hi Thorsten,
  Thanks for the reply - these do seem to be related!
(So copying in Keith, Vjaceslavs, and Tomáš )
(Not my area either).

> Have you seen these reports?
> https://lore.kernel.org/all/2982107.4sosBPzcNG@electra/
> https://lore.kernel.org/all/CAC_j7i1R7oy+nRhxEjCTba=DUgn02w9X+p94DCu0aHv5+5tKnQ@mail.gmail.com/

I hadn't!  Those both describe the problem I was originally trying to debug
when I stumbled into the WARN/BUG/hang with my test program.

> The former led to a fix in the mdraid code that should be in the kernel
> version you are using. But in a reply to the latter report the reporter
> claimed that that fix is not enough (claiming "this was obvious" and
> also using dm), but things then stalled there.

Yeh I see my world has Keith's f7b24c7b41f23

I think the problem I'm seeing is zero length requests coming from somewhere.

The WARN I'm seeing in 7.1.0-rc7+ is:

[ 2681.597042] device-mapper: raid1: Mirror read failed from 252:25. Trying alternative device.
[ 2681.631933] ------------[ cut here ]------------
[ 2681.631939] WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#22: kworker/22:0/18929

1039 int bio_add_page(struct bio *bio, struct page *page,
1040                  unsigned int len, unsigned int offset)
1041 {
1042         if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
1043                 return 0;
1044         if (WARN_ON_ONCE(len == 0))
1045                 return 0;

So it's the ' if (WARN_ON_ONCE(len == 0))'

and the warn I got on the older 7.0.8 was:
[Sun May 17 17:22:52 2026] WARNING: drivers/scsi/scsi_lib.c:1140 at scsi_alloc_sgtables+0x38a/0x400, CPU#28: kworker/28:1H/3943

which I *think* corresponds to:
1164         if (WARN_ON_ONCE(!nr_segs))
1165                 return BLK_STS_IOERR;

so it sounds like we need to find where zero length requests are coming from??

Thanks again,

Dave

> Ciao, Thorsten
> 
> >   This started off as debugging a case where I'd get my RAID1 (on the host)
> > getting a reliable 'rescheduling sector'/disk failure while running the qemu block test suite
> > during a qemu build, but then I tried to build a smaller discrete
> > test, and now I've got a simply triggerable warn and test hang.
> > There's no errors from the underlying SATA layer on the storage,
> > everything resyncs just fine.
> > 
> > I've got an existing LVM vg ('main') with two mirrors on sda2, and sdb2
> > which are SATA disks.
> > 
> > # lvcreate --type mirror --mirrors 1 -L 1G main /dev/sda2 /dev/sdb2
> > # mkfs.ext4 /dev/mapper/main-lvol0
> > # mount /dev/mapper/main-lvol0 /mnt/tmp/
> > # chmod a+rwx /mnt/tmp
> > 
> > $ dd if=/dev/zero of=/mnt/tmp/testfile bs=1024k count=1
> > 
> > (I then wait for the IO to stop)
> > 
> > then we've got this little test program:
> > 
> > <--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->
> > #include <errno.h>
> > #include <fcntl.h>             
> > #include <asm-generic/fcntl.h>
> > #include <stdio.h> 
> > #include <unistd.h>
> > 
> > 
> > const char* path="/mnt/tmp/testfile";
> > static char buf[8192];
> > 
> > int main()                                       
> > {
> >   int fd=open(path, O_RDWR|O_DIRECT|O_CLOEXEC);
> >     
> >   errno=0;
> >   int res3=pread(fd, buf, 4096, 0);
> >   printf("pread of 4096 said: %d (%m)\n", res3);
> > 
> > }
> > <--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->
> > 
> > running that, either hangs or gets a 'pread of 4096 said: -1 (Input/output error)'
> > when it hangs it's unkillable.
> > 
> > at the moment (on 7.1.0-rc7) this is giving:
> > Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> > Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> > Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
> > Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369
> > 
> > (full backtrace below)
> > (Note there is a moan in there about an sdb IO error - repeated a lot - but
> > again, there are no SATA level errors, and the drive is fine on SMART, and
> > I can read the whole of the underlying lvm mirrors, so I don't think the
> > error is physically real).
> > 
> > I did a blktrace, although that gives me a 23G blkparse output, hmm
> > (I see each event repeated a lot - maybe per thread?)
> > 
> > 252,26  15        1     0.000000000  3435  Q  RS 264192 + 8 [dbf]
> >   252,26 is /dev/mapper/main-lvol0
> > 252,24  15        1     0.000005501  3435  A  RS 264192 + 8 <- (252,26) 264192
> >   252,24 is main-lvol0_mimage_0
> > 252,24  15        2     0.000005761  3435  Q  RS 264192 + 8 [dbf]
> >   8,0   15        1     0.000008646  3435  A  RS 71634944 + 8 <- (252,24) 264192
> >     so that's sda 
> >   8,0   15        2     0.000008787  3435  A  RS 73734144 + 8 <- (8,2) 71634944
> >     I guess mapping down from sda2 to sda
> >   8,0   15        3     0.000009037  3435  Q  RS 73734144 + 8 [dbf]
> >   8,0   15        4     0.000009809  3435  C  RS 73734144 + 8 [65514]
> >       ??? Hmm what's the 65514 there?
> > 252,24  15        3     0.000010320  3435  C  RS 264192 + 8 [65514]
> > 252,25  15        1     0.000290384   369  Q   R 264192 + 8 [kworker/15:1]
> >    252,25 is main-lvol0_mimage_1
> > 
> > and at this point I'm a bit lost as to what I'm looking for.
> > 
> > Hints appreciated!
> > 
> > (I don't believe this is a regression - or at least not recent)
> > 
> > Dave
> > 
> > 
> > 
> > 
> > Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> > Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> > Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
> > Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369
> > Jun 14 18:08:32 dalek dmeventd[1010]: main-lvol0 is now in-sync.
> > Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
> > Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
> > Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Not tainted 7.1.0-rc7+ #786 PREEMPT(lazy) 
> > Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
> > Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
> > Jun 14 18:08:32 dalek kernel: RIP: 0010:bio_add_page+0x18b/0x250
> > Jun 14 18:08:32 dalek kernel: Code: 24 10 4c 8b 04 24 84 c0 0f 85 c9 00 00 00 41 0f b7 40 78 48 8b 74 24 08 8b 4c 24 14 e9 b4 fe ff ff 0f 0b 31 c0 e9 55 d1 af 00 <0f> 0b eb f5 48 8b 7f 08 83 7f 60 05 0f 85 00 ff ff ff 49 8b 3b 4c
> > Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176fc10 EFLAGS: 00010246
> > Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffffd1fb8176fd18 RCX: 0000000000000000
> > Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8d1a8eb28b00
> > Jun 14 18:08:32 dalek kernel: RBP: 0000000000000000 R08: ffffd1fb8176fc38 R09: ffffd1fb8176fc40
> > Jun 14 18:08:32 dalek kernel: R10: ffffd1fb8176fc34 R11: 0000000000000000 R12: 0000000000000000
> > Jun 14 18:08:32 dalek kernel: R13: ffffd1fb8176fd90 R14: 0000000000000001 R15: ffff8d1a8eb28b00
> > Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
> > Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
> > Jun 14 18:08:32 dalek kernel: Call Trace:
> > Jun 14 18:08:32 dalek kernel:  <TASK>
> > Jun 14 18:08:32 dalek kernel:  do_region+0x227/0x2a0
> > Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
> > Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
> > Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
> > Jun 14 18:08:32 dalek kernel:  </TASK>
> > Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
> > Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> > Jun 14 18:08:32 dalek kernel: WARNING: drivers/scsi/scsi_lib.c:1164 at scsi_alloc_sgtables+0x38a/0x400, CPU#15: kworker/15:1/369
> > Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
> > Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
> > Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Tainted: G        W           7.1.0-rc7+ #786 PREEMPT(lazy) 
> > Jun 14 18:08:32 dalek kernel: Tainted: [W]=WARN
> > Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
> > Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
> > Jun 14 18:08:32 dalek kernel: RIP: 0010:scsi_alloc_sgtables+0x38a/0x400
> > Jun 14 18:08:32 dalek kernel: Code: 8b 3d ba 2d a9 01 e9 d1 fd ff ff 48 8b 75 00 48 8d bb f0 fe ff ff e8 15 b7 b0 ff 48 89 ab e0 00 00 00 89 45 08 e9 30 ff ff ff <0f> 0b 4c 8b 6c 24 30 b8 0a 00 00 00 e9 21 ff ff ff b8 09 00 00 00
> > Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176f7f0 EFLAGS: 00010246
> > Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffff8d1aedad0110 RCX: 0000000000000009
> > Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: ffffffff99c15960 RDI: ffff8d1aedad0110
> > Jun 14 18:08:32 dalek kernel: RBP: ffff8d1a93d17000 R08: ffff8d1aedad0110 R09: ffff8d1a818fa800
> > Jun 14 18:08:32 dalek kernel: R10: 7020676e69736961 R11: 0000000000000000 R12: 0000000000000000
> > Jun 14 18:08:32 dalek kernel: R13: 0000000000000000 R14: ffff8d1a93394000 R15: ffff8d1a93d17000
> > Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
> > Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
> > Jun 14 18:08:32 dalek kernel: Call Trace:
> > Jun 14 18:08:32 dalek kernel:  <TASK>
> > Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> > Jun 14 18:08:32 dalek kernel:  sd_setup_read_write_cmnd+0x9d/0x740
> > Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> > Jun 14 18:08:32 dalek kernel:  scsi_queue_rq+0x4d2/0x890
> > Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_rq_list+0x241/0x530
> > Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> > Jun 14 18:08:32 dalek kernel:  ? sbitmap_get+0x61/0x100
> > Jun 14 18:08:32 dalek kernel:  __blk_mq_do_dispatch_sched+0x330/0x340
> > Jun 14 18:08:32 dalek kernel:  __blk_mq_sched_dispatch_requests+0x143/0x180
> > Jun 14 18:08:32 dalek kernel:  blk_mq_sched_dispatch_requests+0x2d/0x70
> > Jun 14 18:08:32 dalek kernel:  blk_mq_run_hw_queue+0x2bf/0x350
> > Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> > Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_list+0x172/0x350
> > Jun 14 18:08:32 dalek kernel:  blk_mq_flush_plug_list+0x51/0x1a0
> > Jun 14 18:08:32 dalek kernel:  ? blk_mq_submit_bio+0x71c/0x9f0
> > Jun 14 18:08:32 dalek kernel:  __blk_flush_plug+0x112/0x180
> > Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> > Jun 14 18:08:32 dalek kernel:  __submit_bio+0x19c/0x260
> > Jun 14 18:08:32 dalek kernel:  __submit_bio_noacct+0x8e/0x210
> > Jun 14 18:08:32 dalek kernel:  do_region+0x14c/0x2a0
> > Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
> > Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
> > Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
> > Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> > Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
> > Jun 14 18:08:32 dalek kernel:  </TASK>
> > Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > Jun 14 18:08:37 dalek kernel: blk_print_req_error: 241000 callbacks suppressed
> > Jun 14 18:08:37 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> > 
> > 
> 
-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* Re: [PATCH 1/3] mm/page_io: let block drivers register custom swap I/O ops
From: Jianyue Wu @ 2026-06-15 12:49 UTC (permalink / raw)
  To: YoungJun Park
  Cc: Andrew Morton, Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham,
	Barry Song, Kairui Song, Kemeng Shi, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc
In-Reply-To: <ai9abo7GwMl+g43G@yjaykim-PowerEdge-T330>

On 6/15/2026 9:50 AM, YoungJun Park wrote:
> On Sun, Jun 14, 2026 at 11:35:29PM +0800, Jianyue Wu wrote:
>
> ...
>
> Hello Jianyue.
>
> Currently, the patch commit log indicates only a single custom swap
> registration is supported. Shouldn't we allow multiple block drivers to
> register their custom ops simultaneously from the beginning?
>
>> int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
>> struct list_head *folio_list);
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 284eebc40a70..ebdc96092961 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -2849,6 +2849,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
>> sis->ops = &swap_bdev_ops;
>>
>> if (S_ISBLK(inode->i_mode)) {
>> + const struct swap_ops *block_ops = lookup_swap_block_ops(sis);
>
> Also, just a personal thought on this part.
>
> Instead of using `block_device_fops` as a lookup key, what if we handle
> this similarly to how filesystems use the `a_ops->swap_activate` callback?
>
> We could add a `swap_activate` callback directly into
> struct block_device_operations (zram's zram_devops). This way, the
> block device itself can set up and replace the swap `ops` directly without
> needing a separate registration/lookup mechanism.
>
> What are your thoughts on this approach?
>
> Thanks,
> Youngjun Park
>

Hello Youngjun,

On multiple registrations:
I was also a bit hesitant about this previously. Agreed, it is better to
support multiple block drivers directly; I'll update it.

On swap_activate:
Using a swap_activate callback is a very good idea; it is much
cleaner, and I like this approach :) setup_swap_extents() would call it for
S_ISBLK swap targets, and the driver would install sis->ops at swapon
time. When the callback is NULL, the core can fall back to
swap_bdev_activate() and swap_bdev_ops. That removes the separate global
registration/lookup mechanism entirely, and multiple block drivers are
supported naturally because each device carries its own ops table.

Thanks,
Jianyue

^ permalink raw reply

* [PATCH V2] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
From: Zizhi Wo @ 2026-06-15 11:55 UTC (permalink / raw)
  To: axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, yukuai, houtao1, wozizhi

From: Zizhi Wo <wozizhi@huawei.com>

[BUG]
Our fuzz testing triggered a blkcg use-after-free issue:

  BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
  Call Trace:
  ...
  blkcg_deactivate_policy+0x244/0x4d0
  ioc_rqos_exit+0x44/0xe0
  rq_qos_exit+0xba/0x120
  __del_gendisk+0x50b/0x800
  del_gendisk+0xff/0x190
  ...

[CAUSE]
process1						process2
cgroup_rmdir
...
  css_killed_work_fn
    offline_css
    ...
      blkcg_destroy_blkgs
      ...
        __blkg_release
	  css_put(&blkg->blkcg->css)
          blkg_free
	    INIT_WORK(xxx, blkg_free_workfn)
	    schedule_work
    css_put
    ...
      blkcg_css_free
        kfree(blkcg)--------blkcg has been freed!!!
====================================schedule_work
              blkg_free_workfn
							__del_gendisk
							  rq_qos_exit
							    ioc_rqos_exit
							      blkcg_deactivate_policy
							        mutex_lock(&q->blkcg_mutex)
								spin_lock_irq(&q->queue_lock)
							        list_for_each_entry(blkg, xxx)
								  blkcg = blkg->blkcg
								  spin_lock(&blkcg->lock)-------UAF!!!
	        mutex_lock(&q->blkcg_mutex)
	        spin_lock_irq(&q->queue_lock)
	        /* Only then is the blkg removed from the list */
	        list_del_init(&blkg->q_node)

As a result, a blkg can still be reachable through q->blkg_list while
its ->blkcg has already been freed.

[FIX]
Fix this by deferring the blkcg css_put() until after the blkg has been
unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the
blkcg outlives every blkg still reachable through q->blkg_list, so any
iterator holding q->queue_lock is guaranteed to observe a valid
blkg->blkcg.

While at it, move css_tryget_online() from blkg_create() into blkg_alloc()
so that the css reference is owned by the alloc/free pair rather than
straddling layers:
blkg_alloc()  <-> blkg_free()
blkg_create() <-> blkg_destroy()

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Suggested-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
v2:
 - Move css_tryget_online() from blkg_create() into blkg_alloc() so the
   css reference follows the blkg's own lifetime, making the put in
   blkg_free_workfn() symmetric with the get in blkg_alloc().

v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/

 block/blk-cgroup.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc63bd220865..27414c291e49 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work)
 	if (blkg->parent)
 		blkg_put(blkg->parent);
 	spin_lock_irq(&q->queue_lock);
 	list_del_init(&blkg->q_node);
 	spin_unlock_irq(&q->queue_lock);
+	/*
+	 * Release blkcg css ref only after blkg is removed from q->blkg_list,
+	 * so concurrent iterators won't see a blkg with a freed blkcg.
+	 */
+	css_put(&blkg->blkcg->css);
 	mutex_unlock(&q->blkcg_mutex);
 
 	blk_put_queue(q);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
@@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu)
 	 * blkg_stat_lock is for serializing blkg stat update
 	 */
 	for_each_possible_cpu(cpu)
 		__blkcg_rstat_flush(blkcg, cpu);
 
-	/* release the blkcg and parent blkg refs this blkg has been holding */
-	css_put(&blkg->blkcg->css);
 	blkg_free(blkg);
 }
 
 /*
  * A group is RCU protected, but having an rcu lock does not mean that one
@@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
 	if (!blkg->iostat_cpu)
 		goto out_exit_refcnt;
 	if (!blk_get_queue(disk->queue))
 		goto out_free_iostat;
+	/* blkg holds a reference to blkcg */
+	if (!css_tryget_online(&blkcg->css))
+		goto out_put_queue;
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
 	blkg->iostat.blkg = blkg;
@@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 
 out_free_pds:
 	while (--i >= 0)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+	css_put(&blkcg->css);
+out_put_queue:
 	blk_put_queue(disk->queue);
 out_free_iostat:
 	free_percpu(blkg->iostat_cpu);
 out_exit_refcnt:
 	percpu_ref_exit(&blkg->refcnt);
@@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 	if (blk_queue_dying(disk->queue)) {
 		ret = -ENODEV;
 		goto err_free_blkg;
 	}
 
-	/* blkg holds a reference to blkcg */
-	if (!css_tryget_online(&blkcg->css)) {
-		ret = -ENODEV;
-		goto err_free_blkg;
-	}
-
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 	}
 	blkg = new_blkg;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -ENODEV;
-			goto err_put_css;
+			goto err_free_blkg;
 		}
 		blkg_get(blkg->parent);
 	}
 
 	/* invoke per-policy init */
@@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 
 	/* @blkg failed fully initialized, use the usual release path */
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
-err_put_css:
-	css_put(&blkcg->css);
 err_free_blkg:
 	if (new_blkg)
 		blkg_free(new_blkg);
 	return ERR_PTR(ret);
 }
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 2/3] crypto: testmgr - test for multi-data-unit dispatch
From: Leonid Ravich @ 2026-06-15 11:14 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Alasdair Kergon, Ard Biesheuvel, Eric Biggers, Jens Axboe,
	Horia Geanta, Gilad Ben-Yossef, linux-crypto, dm-devel,
	linux-block
In-Reply-To: <20260615111459.9452-1-lravich@amazon.com>

Add a test that runs on every skcipher with ivsize == 16.  It
encrypts random plaintext two ways and compares:

  1. one batched request with skcipher_request_set_data_unit_size()
     set, over a deliberately fragmented scatterlist whose entries do
     not align to the data-unit size (so per-DU views cross SG entries
     and exercise the scatter_walk cursor), and
  2. an independent reference of N single-DU requests with IVs walked
     as a 128-bit LE counter, matching the convention documented in
     skcipher_request_set_data_unit_size().

The two must produce byte-identical ciphertext; this pins the IV
convention rather than only checking encrypt/decrypt symmetry.  The
batched ciphertext is then round-tripped back to plaintext, and the
caller IV is checked unchanged.  Iterates over typical data unit
sizes (512, 1024, 2048, 4096).

Algorithms the validator rejects for multi-DU return -EOPNOTSUPP on
the first call and skip cleanly; a genuine mismatch returns -EBADMSG
so it cannot be confused with a skip.

Signed-off-by: Leonid Ravich <lravich@amazon.com>
---
 crypto/testmgr.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 4d86efae65b2..5cbd0f4b070e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3211,6 +3211,194 @@ static int test_skcipher(int enc, const struct cipher_test_suite *suite,
 	return 0;
 }
 
+/* Increment a 16-byte IV as a little-endian 128-bit counter. */
+static void test_mdu_iv_inc(u8 iv[16])
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		if (++iv[i])
+			break;
+}
+
+/*
+ * Encrypt one du_size block with a plain single-DU request; used to
+ * build an independent reference for the batched dispatch.
+ */
+static int test_mdu_ref_encrypt(struct crypto_skcipher *tfm, const u8 *in,
+				u8 *out, unsigned int du_size, const u8 iv[16])
+{
+	struct skcipher_request *req;
+	struct scatterlist sg_in, sg_out;
+	DECLARE_CRYPTO_WAIT(wait);
+	u8 ivbuf[16];
+	int err;
+
+	req = skcipher_request_alloc(tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+	memcpy(ivbuf, iv, 16);
+	memcpy(out, in, du_size);
+	sg_init_one(&sg_in, out, du_size);
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				      CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &wait);
+	skcipher_request_set_crypt(req, &sg_in, &sg_in, du_size, ivbuf);
+	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+	skcipher_request_free(req);
+	return err;
+}
+
+/*
+ * Build a deliberately fragmented SG over @buf: entries that do not
+ * align to du_size, so the splitter's per-DU views cross SG entries
+ * and exercise the scatter_walk cursor.
+ */
+static void test_mdu_sg_fragment(struct scatterlist *sg, unsigned int nents,
+				 u8 *buf, unsigned int total)
+{
+	unsigned int chunk = total / nents;
+	unsigned int off = 0, i;
+
+	sg_init_table(sg, nents);
+	for (i = 0; i < nents; i++) {
+		unsigned int len = (i == nents - 1) ? total - off : chunk;
+
+		sg_set_buf(&sg[i], buf + off, len);
+		off += len;
+	}
+}
+
+/*
+ * Multi-DU test: verify the batched dispatch produces byte-identical
+ * ciphertext to an independent N x single-DU reference with per-DU IVs
+ * walked as a 128-bit LE counter (pins the IV convention, not just
+ * enc/dec symmetry), over a fragmented SG, then round-trips.  Real
+ * mismatches return -EBADMSG; ineligible algorithms skip via the
+ * validator's -EOPNOTSUPP.
+ */
+#define TEST_MDU_NR_UNITS	4
+#define TEST_MDU_NR_FRAGS	5
+static int test_skcipher_multi_du_one(struct crypto_skcipher *tfm,
+				      unsigned int du_size)
+{
+	const char *driver = crypto_skcipher_driver_name(tfm);
+	const unsigned int total = du_size * TEST_MDU_NR_UNITS;
+	struct skcipher_request *req = NULL;
+	struct scatterlist sg[TEST_MDU_NR_FRAGS];
+	DECLARE_CRYPTO_WAIT(wait);
+	u8 iv_orig[16], iv_work[16], iv_ref[16];
+	u8 *plain = NULL, *buf = NULL, *ref = NULL;
+	unsigned int u;
+	int err;
+
+	plain = kmalloc(total, GFP_KERNEL);
+	buf = kmalloc(total, GFP_KERNEL);
+	ref = kmalloc(total, GFP_KERNEL);
+	req = skcipher_request_alloc(tfm, GFP_KERNEL);
+	if (!plain || !buf || !ref || !req) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	get_random_bytes(plain, total);
+	get_random_bytes(iv_orig, sizeof(iv_orig));
+
+	/* Reference: per-DU single requests with LE128-walked IVs. */
+	memcpy(iv_ref, iv_orig, sizeof(iv_orig));
+	for (u = 0; u < TEST_MDU_NR_UNITS; u++) {
+		err = test_mdu_ref_encrypt(tfm, plain + u * du_size,
+					   ref + u * du_size, du_size, iv_ref);
+		/* First single-DU call reveals an ineligible algorithm. */
+		if (err == -EOPNOTSUPP && u == 0)
+			goto out;
+		if (err) {
+			pr_err("alg: skcipher: %s multi-DU ref encrypt failed (du=%u): %d\n",
+			       driver, du_size, err);
+			goto out;
+		}
+		test_mdu_iv_inc(iv_ref);
+	}
+
+	/* Batched: one request over a fragmented SG. */
+	memcpy(buf, plain, total);
+	memcpy(iv_work, iv_orig, sizeof(iv_orig));
+	test_mdu_sg_fragment(sg, TEST_MDU_NR_FRAGS, buf, total);
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				      CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &wait);
+	skcipher_request_set_crypt(req, sg, sg, total, iv_work);
+	skcipher_request_set_data_unit_size(req, du_size);
+	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+	if (err == -EOPNOTSUPP)
+		goto out;
+	if (err) {
+		pr_err("alg: skcipher: %s multi-DU encrypt failed (du=%u): %d\n",
+		       driver, du_size, err);
+		goto out;
+	}
+	if (memcmp(buf, ref, total) != 0) {
+		pr_err("alg: skcipher: %s multi-DU ciphertext differs from single-DU reference (du=%u)\n",
+		       driver, du_size);
+		err = -EBADMSG;
+		goto out;
+	}
+	/* req->iv must be unchanged after multi-DU dispatch. */
+	if (memcmp(iv_work, iv_orig, sizeof(iv_orig)) != 0) {
+		pr_err("alg: skcipher: %s multi-DU encrypt mutated caller IV (du=%u)\n",
+		       driver, du_size);
+		err = -EBADMSG;
+		goto out;
+	}
+
+	/* Round-trip the batched ciphertext back to plaintext. */
+	test_mdu_sg_fragment(sg, TEST_MDU_NR_FRAGS, buf, total);
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				      CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &wait);
+	skcipher_request_set_crypt(req, sg, sg, total, iv_work);
+	skcipher_request_set_data_unit_size(req, du_size);
+	err = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+	if (err) {
+		pr_err("alg: skcipher: %s multi-DU decrypt failed (du=%u): %d\n",
+		       driver, du_size, err);
+		goto out;
+	}
+	if (memcmp(buf, plain, total) != 0) {
+		pr_err("alg: skcipher: %s multi-DU round-trip mismatch (du=%u)\n",
+		       driver, du_size);
+		err = -EBADMSG;
+	}
+
+out:
+	skcipher_request_free(req);
+	kfree(ref);
+	kfree(buf);
+	kfree(plain);
+	return err;
+}
+
+static int test_skcipher_multi_du(struct crypto_skcipher *tfm)
+{
+	static const unsigned int du_sizes[] = { 512, 1024, 2048, 4096 };
+	unsigned int j;
+	int err;
+
+	if (crypto_skcipher_ivsize(tfm) != 16)
+		return 0;
+
+	for (j = 0; j < ARRAY_SIZE(du_sizes); j++) {
+		err = test_skcipher_multi_du_one(tfm, du_sizes[j]);
+		/* Ineligible algorithms skip; real failures propagate. */
+		if (err == -EOPNOTSUPP)
+			return 0;
+		if (err)
+			return err;
+		cond_resched();
+	}
+	return 0;
+}
+
 static int alg_test_skcipher(const struct alg_test_desc *desc,
 			     const char *driver, u32 type, u32 mask)
 {
@@ -3259,6 +3447,10 @@ static int alg_test_skcipher(const struct alg_test_desc *desc,
 	if (err)
 		goto out;
 
+	err = test_skcipher_multi_du(tfm);
+	if (err)
+		goto out;
+
 	err = test_skcipher_vs_generic_impl(desc->generic_driver, req, tsgls);
 out:
 	free_cipher_test_sglists(tsgls);
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 3/3] dm crypt: batch all sectors of a bio per crypto request
From: Leonid Ravich @ 2026-06-15 11:14 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Alasdair Kergon, Ard Biesheuvel, Eric Biggers, Jens Axboe,
	Horia Geanta, Gilad Ben-Yossef, linux-crypto, dm-devel,
	linux-block
In-Reply-To: <20260615111459.9452-1-lravich@amazon.com>

Submit one skcipher request per bio with
skcipher_request_set_data_unit_size(req, cc->sector_size) instead of
issuing one request per sector.  This removes per-sector overhead in
the crypto API hot path: request allocation, callback dispatch,
completion handling, and SG setup.

The optimisation is enabled automatically at table load when all
of the following hold:

 - the cipher is non-AEAD (i.e. an skcipher), synchronous, with tfms_count == 1;
 - the IV mode advertises sector_iv_le128, i.e. its per-sector IV
   advances as a 128-bit LE counter, matching the convention
   documented in skcipher_request_set_data_unit_size().  Only plain64
   sets it today (its 64-bit LE counter extends correctly); plain is
   excluded as its 32-bit counter wraps differently across a
   2^32-sector boundary;
 - ivsize is 16 (the core rejects other sizes with -EOPNOTSUPP);
 - the iv_gen_ops->post() hook is unset;
 - dm-integrity is not stacked (no integrity tag or integrity IV).

The cipher driver does not need to advertise anything: the crypto
API auto-splits multi-data-unit requests for drivers that cannot
handle them natively, so dm-crypt sees the same fast batched
submission contract regardless of the underlying driver.

A new CRYPT_MULTI_DATA_UNIT cipher_flag, set once at construction
time, gates the multi-data-unit dispatch.  The existing per-sector
path in crypt_convert_block_skcipher() is unchanged; the new
crypt_convert_block_skcipher_multi() is reached from a small
dispatch in crypt_convert() and shares the same backlog/-EBUSY/
-EINPROGRESS flow control with the per-sector path.

Heap-allocated scatterlists are stashed in dm_crypt_request and
freed in crypt_free_req_skcipher() to avoid races between the
synchronous-success free path and async-completion reuse from the
request pool.  On scatterlist allocation failure the helper returns
-EAGAIN, and the core returns -EOPNOTSUPP if a driver turns out
unable to do multi-DU; crypt_convert() handles both by clearing its
local multi_du flag and falling back to the per-sector path for the
rest of the current crypt_convert() invocation, ensuring forward progress
on the swap-out-to-dm-crypt path even under total memory exhaustion
(the per-sector path uses only cc->req_pool, a mempool with
reservoir set up at table-load time, and the inline
dmreq->sg_in[]/sg_out[] arrays — no allocation that could fail).

Verified end-to-end with a byte-equivalence test: encrypted output
of plain64 dm-crypt with the multi-data-unit path matches output of
the single-data-unit path bit-for-bit over a 256 MB device, with
xts-aes-aesni driving the auto-split path.

Signed-off-by: Leonid Ravich <lravich@amazon.com>
---
 drivers/md/dm-crypt.c | 215 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 207 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 608b617fb817..bfb98dd876d7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -101,6 +101,9 @@ struct dm_crypt_request {
 	struct scatterlist sg_in[4];
 	struct scatterlist sg_out[4];
 	u64 iv_sector;
+	/* Multi-data-unit SG arrays, NULL when sg_in[]/sg_out[] suffice. */
+	struct scatterlist *sg_in_ext;
+	struct scatterlist *sg_out_ext;
 };
 
 struct crypt_config;
@@ -115,6 +118,12 @@ struct crypt_iv_operations {
 			 struct dm_crypt_request *dmreq);
 	void (*post)(struct crypt_config *cc, u8 *iv,
 		     struct dm_crypt_request *dmreq);
+	/*
+	 * The per-sector IV advances as a 128-bit LE counter, so a bio's
+	 * consecutive sectors share one starting IV and can be batched into
+	 * a single skcipher request via data_unit_size.
+	 */
+	bool sector_iv_le128;
 };
 
 struct iv_benbi_private {
@@ -151,6 +160,7 @@ enum cipher_flags {
 	CRYPT_IV_LARGE_SECTORS,		/* Calculate IV from sector_size, not 512B sectors */
 	CRYPT_ENCRYPT_PREPROCESS,	/* Must preprocess data for encryption (elephant) */
 	CRYPT_KEY_MAC_SIZE_SET,		/* The integrity_key_size option was used */
+	CRYPT_MULTI_DATA_UNIT,		/* Batch all sectors of a bio per crypto request */
 };
 
 /*
@@ -1018,7 +1028,8 @@ static const struct crypt_iv_operations crypt_iv_plain_ops = {
 };
 
 static const struct crypt_iv_operations crypt_iv_plain64_ops = {
-	.generator = crypt_iv_plain64_gen
+	.generator = crypt_iv_plain64_gen,
+	.sector_iv_le128 = true,
 };
 
 static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
@@ -1426,12 +1437,126 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
 	return r;
 }
 
+/*
+ * Submit all remaining sectors of the current bio in one skcipher request.
+ * Same return convention as crypt_convert_block_skcipher() except for
+ * -EAGAIN, which the caller must treat as "disable multi-DU and re-enter
+ * the per-sector path" so swap-out-to-dm-crypt always makes forward
+ * progress on the mempool reserve.
+ */
+static int crypt_convert_block_skcipher_multi(struct crypt_config *cc,
+					      struct convert_context *ctx,
+					      struct skcipher_request *req,
+					      unsigned int *out_processed)
+{
+	const unsigned int sector_size = cc->sector_size;
+	const gfp_t gfp = GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN;
+	unsigned int total = ctx->iter_in.bi_size;
+	unsigned int n_sg_in = 0, n_sg_out = 0;
+	struct dm_crypt_request *dmreq = dmreq_of_req(cc, req);
+	struct scatterlist *sg_in = NULL, *sg_out = NULL;
+	struct bvec_iter iter_in, iter_out;
+	struct bio_vec bv;
+	u8 *iv, *org_iv;
+	int r;
+
+	if (WARN_ON_ONCE(ctx->iter_in.bi_size != ctx->iter_out.bi_size))
+		return -EIO;
+	if (unlikely(total & (sector_size - 1)))
+		return -EIO;
+
+	iter_in = ctx->iter_in;
+	iter_in.bi_size = total;
+	__bio_for_each_bvec(bv, ctx->bio_in, iter_in, iter_in)
+		n_sg_in++;
+
+	iter_out = ctx->iter_out;
+	iter_out.bi_size = total;
+	__bio_for_each_bvec(bv, ctx->bio_out, iter_out, iter_out)
+		n_sg_out++;
+
+	sg_in = kmalloc_array(n_sg_in, sizeof(*sg_in), gfp);
+	sg_out = (ctx->bio_in == ctx->bio_out) ? sg_in :
+		 kmalloc_array(n_sg_out, sizeof(*sg_out), gfp);
+	if (!sg_in || !sg_out) {
+		kfree(sg_in);
+		if (sg_out != sg_in)
+			kfree(sg_out);
+		return -EAGAIN;
+	}
+
+	sg_init_table(sg_in, n_sg_in);
+	{
+		unsigned int i = 0;
+
+		iter_in = ctx->iter_in;
+		iter_in.bi_size = total;
+		__bio_for_each_bvec(bv, ctx->bio_in, iter_in, iter_in)
+			sg_set_page(&sg_in[i++], bv.bv_page, bv.bv_len,
+				    bv.bv_offset);
+	}
+
+	if (sg_out != sg_in) {
+		unsigned int i = 0;
+
+		sg_init_table(sg_out, n_sg_out);
+		iter_out = ctx->iter_out;
+		iter_out.bi_size = total;
+		__bio_for_each_bvec(bv, ctx->bio_out, iter_out, iter_out)
+			sg_set_page(&sg_out[i++], bv.bv_page, bv.bv_len,
+				    bv.bv_offset);
+	}
+
+	dmreq->iv_sector = ctx->cc_sector;
+	if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+		dmreq->iv_sector >>= cc->sector_shift;
+	dmreq->ctx = ctx;
+
+	iv = iv_of_dmreq(cc, dmreq);
+	org_iv = org_iv_of_dmreq(cc, dmreq);
+	r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
+	if (r < 0)
+		goto out_free_sg;
+	memcpy(iv, org_iv, cc->iv_size);
+
+	dmreq->sg_in_ext = sg_in;
+	dmreq->sg_out_ext = (sg_out == sg_in) ? NULL : sg_out;
+
+	skcipher_request_set_crypt(req, sg_in, sg_out, total, iv);
+	skcipher_request_set_data_unit_size(req, sector_size);
+
+	if (bio_data_dir(ctx->bio_in) == WRITE)
+		r = crypto_skcipher_encrypt(req);
+	else
+		r = crypto_skcipher_decrypt(req);
+
+	/*
+	 * Sync error: kcryptd_async_done won't run, so free the SG
+	 * arrays here.  Async returns (-EINPROGRESS, -EBUSY) hand
+	 * ownership to the completion callback.
+	 */
+	if (r && r != -EINPROGRESS && r != -EBUSY)
+		goto out_free_sg;
+
+	*out_processed = total;
+	return r;
+
+out_free_sg:
+	kfree(sg_in);
+	if (sg_out != sg_in)
+		kfree(sg_out);
+	dmreq->sg_in_ext = NULL;
+	dmreq->sg_out_ext = NULL;
+	return r;
+}
+
 static void kcryptd_async_done(void *async_req, int error);
 
 static int crypt_alloc_req_skcipher(struct crypt_config *cc,
 				     struct convert_context *ctx)
 {
 	unsigned int key_index = ctx->cc_sector & (cc->tfms_count - 1);
+	struct dm_crypt_request *dmreq;
 
 	if (!ctx->r.req) {
 		ctx->r.req = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO);
@@ -1441,6 +1566,11 @@ static int crypt_alloc_req_skcipher(struct crypt_config *cc,
 
 	skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
 
+	/* Multi-DU SG arrays are owned by the helper that allocates them. */
+	dmreq = dmreq_of_req(cc, ctx->r.req);
+	dmreq->sg_in_ext = NULL;
+	dmreq->sg_out_ext = NULL;
+
 	/*
 	 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
 	 * requests if driver request queue is full.
@@ -1487,6 +1617,12 @@ static void crypt_free_req_skcipher(struct crypt_config *cc,
 				    struct skcipher_request *req, struct bio *base_bio)
 {
 	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+	struct dm_crypt_request *dmreq = dmreq_of_req(cc, req);
+
+	kfree(dmreq->sg_in_ext);
+	dmreq->sg_in_ext = NULL;
+	kfree(dmreq->sg_out_ext);
+	dmreq->sg_out_ext = NULL;
 
 	if ((struct skcipher_request *)(io + 1) != req)
 		mempool_free(req, &cc->req_pool);
@@ -1515,7 +1651,9 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
 static blk_status_t crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx, bool atomic, bool reset_pending)
 {
-	unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
+	const unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
+	bool multi_du = test_bit(CRYPT_MULTI_DATA_UNIT, &cc->cipher_flags);
+	unsigned int processed;
 	int r;
 
 	/*
@@ -1536,8 +1674,13 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->cc_pending);
 
+		processed = cc->sector_size;
 		if (crypt_integrity_aead(cc))
 			r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, ctx->tag_offset);
+		else if (multi_du)
+			r = crypt_convert_block_skcipher_multi(cc, ctx,
+							       ctx->r.req,
+							       &processed);
 		else
 			r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, ctx->tag_offset);
 
@@ -1559,8 +1702,19 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
 					 * exit and continue processing in a workqueue
 					 */
 					ctx->r.req = NULL;
-					ctx->tag_offset++;
-					ctx->cc_sector += sector_step;
+					if (!multi_du) {
+						ctx->tag_offset++;
+						ctx->cc_sector += sector_step;
+					} else {
+						bio_advance_iter(ctx->bio_in,
+								 &ctx->iter_in,
+								 processed);
+						bio_advance_iter(ctx->bio_out,
+								 &ctx->iter_out,
+								 processed);
+						ctx->cc_sector +=
+							processed >> SECTOR_SHIFT;
+					}
 					return BLK_STS_DEV_RESOURCE;
 				}
 			} else {
@@ -1574,19 +1728,41 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
 		 */
 		case -EINPROGRESS:
 			ctx->r.req = NULL;
-			ctx->tag_offset++;
-			ctx->cc_sector += sector_step;
+			if (!multi_du) {
+				ctx->tag_offset++;
+				ctx->cc_sector += sector_step;
+			} else {
+				bio_advance_iter(ctx->bio_in, &ctx->iter_in,
+						 processed);
+				bio_advance_iter(ctx->bio_out, &ctx->iter_out,
+						 processed);
+				ctx->cc_sector += processed >> SECTOR_SHIFT;
+			}
 			continue;
 		/*
 		 * The request was already processed (synchronously).
 		 */
 		case 0:
 			atomic_dec(&ctx->cc_pending);
-			ctx->cc_sector += sector_step;
-			ctx->tag_offset++;
+			if (!multi_du) {
+				ctx->cc_sector += sector_step;
+				ctx->tag_offset++;
+			} else {
+				bio_advance_iter(ctx->bio_in, &ctx->iter_in,
+						 processed);
+				bio_advance_iter(ctx->bio_out, &ctx->iter_out,
+						 processed);
+				ctx->cc_sector += processed >> SECTOR_SHIFT;
+			}
 			if (!atomic)
 				cond_resched();
 			continue;
+		/* Multi-DU rejected (no memory or sync-only mismatch): fall back. */
+		case -EAGAIN:
+		case -EOPNOTSUPP:
+			atomic_dec(&ctx->cc_pending);
+			multi_du = false;
+			continue;
 		/*
 		 * There was a data integrity error.
 		 */
@@ -3063,6 +3239,29 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
 		}
 	}
 
+	/*
+	 * Enable multi-data-unit batching only when per-DU IVs can be
+	 * derived from one starting IV as a 128-bit LE counter, matching
+	 * skcipher_request_set_data_unit_size().  Only IV modes flagged
+	 * sector_iv_le128 qualify (plain64; not plain, whose 32-bit counter
+	 * wraps differently across a 2^32-sector boundary).  ivsize must be
+	 * 16 (the core rejects otherwise) and the cipher must be sync,
+	 * single-tfm, no integrity, no per-sector post() hook.  The driver
+	 * advertises nothing: the core auto-splits for drivers that lack
+	 * native support.
+	 */
+	if (!crypt_integrity_aead(cc) && cc->tfms_count == 1 &&
+	    cc->iv_gen_ops && cc->iv_gen_ops->sector_iv_le128 &&
+	    !cc->iv_gen_ops->post &&
+	    !cc->integrity_tag_size && !cc->integrity_iv_size &&
+	    crypto_skcipher_ivsize(any_tfm(cc)) == 16 &&
+	    !(crypto_skcipher_alg(any_tfm(cc))->base.cra_flags &
+	      CRYPTO_ALG_ASYNC)) {
+		set_bit(CRYPT_MULTI_DATA_UNIT, &cc->cipher_flags);
+		DMINFO("Using multi-data-unit crypto offload (du=%u)",
+		       cc->sector_size);
+	}
+
 	/* wipe the kernel key payload copy */
 	if (cc->key_string)
 		memset(cc->key, 0, cc->key_size * sizeof(u8));
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 1/3] crypto: skcipher - add per-request data_unit_size with auto-splitting
From: Leonid Ravich @ 2026-06-15 11:14 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Alasdair Kergon, Ard Biesheuvel, Eric Biggers, Jens Axboe,
	Horia Geanta, Gilad Ben-Yossef, linux-crypto, dm-devel,
	linux-block
In-Reply-To: <20260615111459.9452-1-lravich@amazon.com>

Add a data_unit_size field to struct skcipher_request that lets a
caller submit several data units (typically 512..4096-byte sectors)
sharing one starting IV in a single request.  Algorithms derive each
data unit's IV from the caller-supplied IV by treating it as a
128-bit little-endian counter and adding the data-unit index, which
matches the layout produced by dm-crypt's plain64 IV mode and by
typical inline-encryption hardware.

This mirrors the data_unit_size concept already exposed by
struct blk_crypto_config for inline encryption.

The crypto API auto-splits a multi-data-unit request into per-DU
sub-requests when the underlying algorithm does not advertise
CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU (a type-specific cra_flags bit,
defined in crypto/internal/skcipher.h).  A consumer sets
data_unit_size and submits: a native driver handles all units in one
pass, otherwise the core splits transparently.  The split derives
per-DU IVs as a 128-bit LE counter, so this is correct only for
algorithms using that IV convention (e.g. XTS with plain64-style
IVs); callers are responsible for that match, as they already are for
the IV itself.

skcipher_request_set_tfm() resets the field to 0 so a request reused
from a pool or stack defaults to single-data-unit semantics; callers
that want batching set it explicitly via
skcipher_request_set_data_unit_size() after configuring the tfm.

crypto_skcipher_encrypt()/decrypt() call
crypto_skcipher_validate_multi_du() before any algorithm dispatch.
data_unit_size must be a power of two when non-zero (realistic sizes
are 512..4096, letting the per-DU loop and the cryptlen alignment
check use a mask instead of a divide) and cryptlen a positive
multiple of it; a malformed geometry is rejected with -EINVAL.  A
target that cannot do multi-DU - ivsize != SKCIPHER_MDU_IVSIZE (16),
an lskcipher, or an async algorithm without the native flag - is
rejected with -EOPNOTSUPP so a caller can fall back.  Async is
excluded because the splitter dispatches synchronously: an
-EINPROGRESS return would leave later units unsubmitted while the
driver still owned the request's scatterlists and IV.  The check
gates the native path too, so algorithms never see a malformed
multi-DU request.

No in-tree algorithm sets CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU yet;
subsequent patches add the testmgr coverage and the dm-crypt
consumer.

Signed-off-by: Leonid Ravich <lravich@amazon.com>
---
 crypto/skcipher.c                  | 132 +++++++++++++++++++++++++++++
 include/crypto/internal/skcipher.h |  10 +++
 include/crypto/skcipher.h          |  28 ++++++
 3 files changed, 170 insertions(+)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 2b31d1d5d268..9262b47acfb9 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -17,6 +17,7 @@
 #include <linux/cryptouser.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
+#include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
@@ -432,15 +433,139 @@ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_setkey);
 
+/* IV size for the 128-bit LE-counter multi-data-unit convention. */
+#define SKCIPHER_MDU_IVSIZE	16
+
+static inline void skcipher_iv_inc_le128(u8 *iv)
+{
+	__le64 lo_le, hi_le;
+	u64 lo;
+
+	memcpy(&lo_le, iv, 8);
+	memcpy(&hi_le, iv + 8, 8);
+	lo = le64_to_cpu(lo_le) + 1;
+	lo_le = cpu_to_le64(lo);
+	memcpy(iv, &lo_le, 8);
+	if (unlikely(lo == 0)) {
+		hi_le = cpu_to_le64(le64_to_cpu(hi_le) + 1);
+		memcpy(iv + 8, &hi_le, 8);
+	}
+}
+
+/*
+ * Dispatch a multi-data-unit request as one single-DU sub-request per
+ * unit.  Each unit's IV is the caller's IV plus the unit index, taken
+ * as a 128-bit little-endian counter.  A pair of scatter_walks advances
+ * through src/dst in a single linear pass (O(entries + units)); building
+ * each sub-request's view with scatterwalk_ffwd() would instead rescan
+ * from the head every unit, i.e. O(units^2).
+ */
+static int skcipher_split_data_units(struct skcipher_request *req,
+				     int (*body)(struct skcipher_request *))
+{
+	const unsigned int du = req->data_unit_size;
+	const unsigned int total = req->cryptlen;
+	struct scatterlist *orig_src = req->src;
+	struct scatterlist *orig_dst = req->dst;
+	bool inplace = orig_src == orig_dst;
+	struct scatter_walk src_walk, dst_walk;
+	struct scatterlist src_sg[2], dst_sg[2];
+	u8 iv_orig[SKCIPHER_MDU_IVSIZE];
+	u8 iv_work[SKCIPHER_MDU_IVSIZE];
+	unsigned int off;
+	int err = 0;
+
+	memcpy(iv_orig, req->iv, sizeof(iv_orig));
+	memcpy(iv_work, iv_orig, sizeof(iv_orig));
+
+	sg_init_table(src_sg, 2);
+	scatterwalk_start(&src_walk, orig_src);
+	if (!inplace) {
+		sg_init_table(dst_sg, 2);
+		scatterwalk_start(&dst_walk, orig_dst);
+	}
+
+	/* Stop the per-DU body from re-entering the splitter. */
+	req->data_unit_size = 0;
+	req->src = src_sg;
+	req->dst = inplace ? src_sg : dst_sg;
+
+	for (off = 0; off < total; off += du) {
+		req->cryptlen = du;
+		scatterwalk_get_sglist(&src_walk, src_sg);
+		scatterwalk_skip(&src_walk, du);
+		if (!inplace) {
+			scatterwalk_get_sglist(&dst_walk, dst_sg);
+			scatterwalk_skip(&dst_walk, du);
+		}
+
+		err = body(req);
+		if (err)
+			break;
+
+		skcipher_iv_inc_le128(iv_work);
+		memcpy(req->iv, iv_work, sizeof(iv_work));
+	}
+
+	/* Caller-visible IV is the starting IV regardless of outcome. */
+	memcpy(req->iv, iv_orig, sizeof(iv_orig));
+	req->src = orig_src;
+	req->dst = orig_dst;
+	req->cryptlen = total;
+	req->data_unit_size = du;
+	return err;
+}
+
+static int crypto_skcipher_validate_multi_du(struct skcipher_request *req)
+{
+	const unsigned int du = req->data_unit_size;
+	struct crypto_skcipher *tfm;
+	struct skcipher_alg *alg;
+	u32 cra_flags;
+
+	if (likely(!du))
+		return 0;
+	if (!is_power_of_2(du) || du < SKCIPHER_MDU_IVSIZE)
+		return -EINVAL;
+	if (!req->cryptlen || (req->cryptlen & (du - 1)))
+		return -EINVAL;
+
+	tfm = crypto_skcipher_reqtfm(req);
+	alg = crypto_skcipher_alg(tfm);
+
+	/* lskcipher's *_sg path doesn't honour data_unit_size. */
+	if (alg->co.base.cra_type != &crypto_skcipher_type)
+		return -EOPNOTSUPP;
+
+	/* Capability mismatch, not a malformed request: report -EOPNOTSUPP. */
+	if (crypto_skcipher_ivsize(tfm) != SKCIPHER_MDU_IVSIZE)
+		return -EOPNOTSUPP;
+
+	/* The auto-splitter is sync-only; native drivers own async dispatch. */
+	cra_flags = alg->co.base.cra_flags;
+	if ((cra_flags & CRYPTO_ALG_ASYNC) &&
+	    !(cra_flags & CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 int crypto_skcipher_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
+	int err;
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		return -ENOKEY;
+	err = crypto_skcipher_validate_multi_du(req);
+	if (err)
+		return err;
 	if (alg->co.base.cra_type != &crypto_skcipher_type)
 		return crypto_lskcipher_encrypt_sg(req);
+	if (req->data_unit_size &&
+	    !(alg->co.base.cra_flags & CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU))
+		return skcipher_split_data_units(req, alg->encrypt);
 	return alg->encrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt);
@@ -449,11 +574,18 @@ int crypto_skcipher_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
+	int err;
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		return -ENOKEY;
+	err = crypto_skcipher_validate_multi_du(req);
+	if (err)
+		return err;
 	if (alg->co.base.cra_type != &crypto_skcipher_type)
 		return crypto_lskcipher_decrypt_sg(req);
+	if (req->data_unit_size &&
+	    !(alg->co.base.cra_flags & CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU))
+		return skcipher_split_data_units(req, alg->decrypt);
 	return alg->decrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt);
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index a965b6aabf61..4c826f3bc715 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -21,6 +21,16 @@
  */
 #define CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE CRYPTO_ALG_OPTIONAL_KEY
 
+/*
+ * Set by an skcipher that handles skcipher_request::data_unit_size > 0
+ * natively in one pass; otherwise the API splits the request.  Lives in
+ * the type-specific 0xff000000 cra_flags range.  A native driver must
+ * derive per-DU IVs as a 128-bit LE counter and leave @iv at the
+ * caller-supplied starting value on return, success or error, matching
+ * the auto-splitter so the two paths are observably identical.
+ */
+#define CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU	0x01000000
+
 struct aead_request;
 struct rtattr;
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 4efe2ca8c4d1..ced1fae08147 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -31,6 +31,11 @@ struct scatterlist;
 /**
  *	struct skcipher_request - Symmetric key cipher request
  *	@cryptlen: Number of bytes to encrypt or decrypt
+ *	@data_unit_size: Size in bytes of each data unit, or 0 for a
+ *		single-data-unit request (the default).  When non-zero,
+ *		must be a power of two, @cryptlen must be a positive
+ *		multiple of it, and per-DU IVs are derived from @iv as a
+ *		128-bit little-endian counter.
  *	@iv: Initialisation Vector
  *	@src: Source SG list
  *	@dst: Destination SG list
@@ -39,6 +44,7 @@ struct scatterlist;
  */
 struct skcipher_request {
 	unsigned int cryptlen;
+	unsigned int data_unit_size;
 
 	u8 *iv;
 
@@ -225,6 +231,7 @@ struct lskcipher_alg {
 	struct skcipher_request *name = \
 		(((struct skcipher_request *)__##name##_desc)->base.tfm = \
 			crypto_sync_skcipher_tfm((_tfm)), \
+		 ((struct skcipher_request *)__##name##_desc)->data_unit_size = 0, \
 		 (void *)__##name##_desc)
 
 /**
@@ -819,6 +826,8 @@ static inline void skcipher_request_set_tfm(struct skcipher_request *req,
 					    struct crypto_skcipher *tfm)
 {
 	req->base.tfm = crypto_skcipher_tfm(tfm);
+	/* Reused requests default to single-data-unit. */
+	req->data_unit_size = 0;
 }
 
 static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req,
@@ -937,5 +946,24 @@ static inline void skcipher_request_set_crypt(
 	req->iv = iv;
 }
 
+/**
+ * skcipher_request_set_data_unit_size() - submit as multiple data units
+ * @req: request handle
+ * @data_unit_size: data-unit size in bytes (power of two), or 0 to disable
+ *
+ * Process @req as @cryptlen / @data_unit_size data units sharing one starting
+ * @iv, with per-DU IVs derived as a 128-bit little-endian counter.  @cryptlen
+ * must be a positive multiple of @data_unit_size, else the encrypt/decrypt
+ * call returns -EINVAL; a target that cannot do multi-DU (ivsize != 16, an
+ * lskcipher, or async without native support) returns -EOPNOTSUPP.  Unlike
+ * the single-DU path, @iv is preserved across the call regardless of outcome.
+ */
+static inline void
+skcipher_request_set_data_unit_size(struct skcipher_request *req,
+				    unsigned int data_unit_size)
+{
+	req->data_unit_size = data_unit_size;
+}
+
 #endif	/* _CRYPTO_SKCIPHER_H */
 

base-commit: a8cafdf8c949f17c92eca0045532e88ac0dac30d
-- 
2.47.3


^ permalink raw reply related

* [PATCH v4 0/3] crypto: skcipher - per-request multi-data-unit batching
From: Leonid Ravich @ 2026-06-15 11:14 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Alasdair Kergon, Ard Biesheuvel, Eric Biggers, Jens Axboe,
	Horia Geanta, Gilad Ben-Yossef, linux-crypto, dm-devel,
	linux-block

This is v4, addressing Herbert's review of v3.  Two architectural
changes:

  - data_unit_size is now per-request (on struct skcipher_request)
    rather than per-tfm.  Reverts to the v1 placement.

  - The crypto API auto-splits multi-data-unit requests when the
    underlying algorithm does not advertise
    CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU.  Consumers no longer test
    for multi-DU support before submitting; setting data_unit_size
    on any skcipher request whose algorithm uses the 128-bit LE
    counter IV convention "just works".

These two changes shrink the series from 4 patches to 3 (the
generic xts(...) template needs no special handling - the
auto-splitter calls its single-DU encrypt/decrypt once per data
unit) and simplify the dm-crypt consumer (no advertise-flag check,
no per-tfm setup).

v3: https://lore.kernel.org/linux-crypto/20260601085641.16028-1-lravich@amazon.com/
v2: https://lore.kernel.org/linux-crypto/20260527065021.19525-1-lravich@amazon.com/
v1: https://lore.kernel.org/linux-crypto/20260519115955.27267-1-lravich@amazon.com/

The series adds a per-request "data unit size" to the skcipher API
so a caller can submit several data units (typically 512..4096-byte
sectors) sharing one starting IV in a single request.  Algorithms
derive each data unit's IV from the caller-supplied IV by treating
it as a 128-bit little-endian counter and adding the data-unit
index, matching the layout produced by dm-crypt's plain64 IV mode
and by typical inline-encryption hardware.

This mirrors the data_unit_size concept already exposed by
struct blk_crypto_config for inline encryption.

The first user is dm-crypt, which today issues one skcipher request
per sector and so pays a per-sector cost in request allocation,
callback dispatch, completion handling, and scatterlist setup.

Proof-of-concept performance numbers from the RFC reply [1]: +19%
throughput / -40% CPU on a single-core arm64 system with a hardware
XTS-AES-256 accelerator running fio 4 KiB sequential writes through
dm-crypt, when an out-of-tree arm64 xts driver advertises
CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU.  This series itself does not
include arch enablement; the fast path is opt-in per driver, the
slow path is universal via the auto-splitter.

The native fast path amortises both per-sector dispatch and per-sector
crypto setup across a bio - the measured win above, on an engine that
offloads the AES compute.  The auto-splitter is for correctness and
reach: any consumer can set data_unit_size and get correct output with
the per-request allocation/callback/completion cost removed, but it
still issues one alg->encrypt per data unit, so on a software cipher it
saves only dispatch overhead (no throughput figure claimed - that is
hardware- and workload-dependent).  What it guarantees unconditionally
is byte-identical output (Verification below) at O(entries + units),
walking the scatterlists with a pair of struct scatter_walk cursors
rather than rescanning from the head per unit.

[1] https://lore.kernel.org/linux-crypto/20260428101225.24316-1-lravich@amazon.com/

Changes since v3
----------------

- data_unit_size moved from struct crypto_skcipher (per-tfm) to
  struct skcipher_request (per-request).  (Herbert)

- Crypto API auto-splits multi-data-unit requests when the algorithm
  does not advertise CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU.  Drops the
  per-tfm setter/probe in favour of a single
  skcipher_request_set_data_unit_size() usable by every consumer.
  (Herbert)

- CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU is a type-specific cra_flags
  bit (0x01000000) in crypto/internal/skcipher.h, not a generic bit
  in the public header; drivers set it to opt OUT of auto-splitting.

- The auto-splitter advances through src/dst with a pair of struct
  scatter_walk cursors (scatterwalk_start / scatterwalk_get_sglist /
  scatterwalk_skip) instead of scatterwalk_ffwd() per unit, which
  rescans from the head and is O(units^2) under fragmentation; the
  cursors give a single linear pass.  (Eric)

- crypto_skcipher_validate_multi_du() reports -EINVAL for a malformed
  geometry (du not a power of two, cryptlen not a positive multiple)
  and -EOPNOTSUPP for a target that cannot do multi-DU (ivsize != 16,
  lskcipher, or async without the native flag), so a caller can fall
  back.  Gates the native path too, not just the auto-splitter.
  (Eric)

- testmgr cross-checks the batched dispatch against an independent
  N x single-DU reference with LE128-walked IVs over a fragmented
  scatterlist (pins the IV convention and exercises the cursor),
  round-trips, and checks IV preservation.  Ineligible algorithms
  skip via -EOPNOTSUPP; a real mismatch returns -EBADMSG.

- dm-crypt enables batching only for IV modes flagged sector_iv_le128
  (a new bool on struct crypt_iv_operations, set on plain64 only),
  plus ivsize 16, sync, single-tfm, no integrity, no post() hook.  The
  flag replaces a hardcoded plain64 pointer-compare, so eligibility is
  a self-documenting property of the IV mode rather than a special
  case.  plain stays excluded (its 32-bit counter wraps differently
  past 2^32 sectors).  Sets req->data_unit_size = sector_size and
  submits; -EOPNOTSUPP/-EAGAIN fall back to the per-sector path.
  Mikulas's v2 Reviewed-by is dropped as the dm-crypt patch was
  substantially rewritten.

- The generic xts(...) template needs no separate handling, dropping
  the v3 crypto/xts.c patch (4 -> 3 patches).

Design overview
---------------

* Patch 1 adds the data_unit_size field, the setter, the
  CRYPTO_ALG_SKCIPHER_NATIVE_MULTI_DU flag, and the auto-splitter in
  crypto_skcipher_encrypt()/decrypt().  skcipher_request_set_tfm()
  resets the field so a reused request defaults to single-DU.

* Patch 2 adds the testmgr multi-DU test (every ivsize == 16
  skcipher).

* Patch 3 turns dm-crypt batching on automatically under the
  conditions above and sets req->data_unit_size = cc->sector_size.

This series does NOT add the capability flag to any arch driver; the
auto-splitter ensures correctness without that opt-in.

Verification
------------

A regression protocol is included in the project tree
(.claude/regression-protocol.md, .claude/run-regression.sh).  The
reference run reports 12/12 PASS:

  - x86 + arm64 build clean; checkpatch.pl --strict clean.
  - testmgr multi-DU: PASS for every ivsize == 16 skcipher in-tree.
  - dm-crypt activation gating: plain64 enabled; essiv:sha256 /
    plain64be / plain fall back.
  - dm-crypt round-trip plain64 with multi-DU via the auto-splitter
    (xts-aes-aesni, no native flag): PASS.
  - dm-crypt round-trip essiv:sha256 (per-sector path): PASS.
  - dm-crypt low-memory (mem=128M): PASS, no OOM kill.
  - Byte-equivalence: 256 MB of ciphertext through the auto-splitter
    is bit-identical to an unpatched axboe/for-next baseline (sha256
    4913910b1aa6f8859fcb8f4adec20230274993a3ade8f4dd0140a323dc43efc0).
  - arm64 functional under qemu-aarch64: PASS.



Leonid Ravich (3):
  crypto: skcipher - add per-request data_unit_size with auto-splitting
  crypto: testmgr - test for multi-data-unit dispatch
  dm crypt: batch all sectors of a bio per crypto request

 crypto/skcipher.c                  | 132 +++++++++++++++++++
 crypto/testmgr.c                   | 192 +++++++++++++++++++++++++
 drivers/md/dm-crypt.c              | 215 +++++++++++++++++++++++++++--
 include/crypto/internal/skcipher.h |  10 ++
 include/crypto/skcipher.h          |  28 ++++
 5 files changed, 569 insertions(+), 8 deletions(-)


base-commit: a8cafdf8c949f17c92eca0045532e88ac0dac30d
--
2.47.3


^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Thorsten Leemhuis @ 2026-06-15 10:34 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, linux-block, dm-devel
  Cc: Linux kernel regressions list
In-Reply-To: <ai7rnH20IYeSmY8s@gallifrey>

On 6/14/26 19:57, Dr. David Alan Gilbert wrote:
>
>   I've got a repeatable raid hang/warn and would appreciate some pointers
> as where to debug.
>   (I've been logging stuff on  https://bugzilla.kernel.org/show_bug.cgi?id=221535 )

Note: not my area of expertise, so I might be sending you totally
off-track with this comment. Feel free to ignore it. But FWIW:

Have you seen these reports?
https://lore.kernel.org/all/2982107.4sosBPzcNG@electra/
https://lore.kernel.org/all/CAC_j7i1R7oy+nRhxEjCTba=DUgn02w9X+p94DCu0aHv5+5tKnQ@mail.gmail.com/

The former led to a fix in the mdraid code that should be in the kernel
version you are using. But in a reply to the latter report the reporter
claimed that that fix is not enough (stating "this was obvious" and
also using dm), but things then stalled there.

Ciao, Thorsten

>   This started off as debugging a case where I'd get my RAID1 (on the host)
> getting a reliable 'rescheduling sector'/disk failure while running the qemu block test suite
> during a qemu build, but then I tried to build a smaller discrete
> test, and now I've got a simply triggerable warn and test hang.
> There's no errors from the underlying SATA layer on the storage,
> everything resyncs just fine.
> 
> I've got an existing LVM vg ('main') with two mirrors on sda2, and sdb2
> which are SATA disks.
> 
> # lvcreate --type mirror --mirrors 1 -L 1G main /dev/sda2 /dev/sdb2
> # mkfs.ext4 /dev/mapper/main-lvol0
> # mount /dev/mapper/main-lvol0 /mnt/tmp/
> # chmod a+rwx /mnt/tmp
> 
> $ dd if=/dev/zero of=/mnt/tmp/testfile bs=1024k count=1
> 
> (I then wait for the IO to stop)
> 
> then we've got this little test program:
> 
> <--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->
> #include <errno.h>
> #include <fcntl.h>             
> #include <asm-generic/fcntl.h>
> #include <stdio.h> 
> #include <unistd.h>
> 
> 
> const char* path="/mnt/tmp/testfile";
> static char buf[8192];
> 
> int main()                                       
> {
>   int fd=open(path, O_RDWR|O_DIRECT|O_CLOEXEC);
>     
>   errno=0;
>   int res3=pread(fd, buf, 4096, 0);
>   printf("pread of 4096 said: %d (%m)\n", res3);
> 
> }
> <--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->
> 
> running that, either hangs or gets a 'pread of 4096 said: -1 (Input/output error)'
> when it hangs it's unkillable.
> 
> at the moment (on 7.1.0-rc7) this is giving:
> Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
> Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369
> 
> (full backtrace below)
> (Note there is a moan in there about sdb IO error - repeated a lot - but
> again, there's no SATA level errors, and the drive is fine on smart, and
> I can read the whole of the underlying lvm mirrors, so I don't think it's
> physically there).
> 
> I did a blktrace, although that gives me a 23G blkparse output, hmm
> (I see each event repeated a lot - maybe per thread?)
> 
> 252,26  15        1     0.000000000  3435  Q  RS 264192 + 8 [dbf]
>   252,26 is /dev/mapper/main-lvol0
> 252,24  15        1     0.000005501  3435  A  RS 264192 + 8 <- (252,26) 264192
>   252,24 is main-lvol0_mimage_0
> 252,24  15        2     0.000005761  3435  Q  RS 264192 + 8 [dbf]
>   8,0   15        1     0.000008646  3435  A  RS 71634944 + 8 <- (252,24) 264192
>     so that's sda 
>   8,0   15        2     0.000008787  3435  A  RS 73734144 + 8 <- (8,2) 71634944
>     I guess mapping down from sda2 to sda
>   8,0   15        3     0.000009037  3435  Q  RS 73734144 + 8 [dbf]
>   8,0   15        4     0.000009809  3435  C  RS 73734144 + 8 [65514]
>       ??? Hmm what's the 65514 there?
> 252,24  15        3     0.000010320  3435  C  RS 264192 + 8 [65514]
> 252,25  15        1     0.000290384   369  Q   R 264192 + 8 [kworker/15:1]
>    252,25 is main-lvol0_mimage_1
> 
> and at this point I'm a bit lost as to what I'm looking for.
> 
> Hints appreciated!
> 
> (I don't believe this is a regression - or at least not recent)
> 
> Dave
> 
> 
> 
> 
> Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
> Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369
> Jun 14 18:08:32 dalek dmeventd[1010]: main-lvol0 is now in-sync.
> Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
> Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
> Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Not tainted 7.1.0-rc7+ #786 PREEMPT(lazy) 
> Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
> Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
> Jun 14 18:08:32 dalek kernel: RIP: 0010:bio_add_page+0x18b/0x250
> Jun 14 18:08:32 dalek kernel: Code: 24 10 4c 8b 04 24 84 c0 0f 85 c9 00 00 00 41 0f b7 40 78 48 8b 74 24 08 8b 4c 24 14 e9 b4 fe ff ff 0f 0b 31 c0 e9 55 d1 af 00 <0f> 0b eb f5 48 8b 7f 08 83 7f 60 05 0f 85 00 ff ff ff 49 8b 3b 4c
> Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176fc10 EFLAGS: 00010246
> Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffffd1fb8176fd18 RCX: 0000000000000000
> Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8d1a8eb28b00
> Jun 14 18:08:32 dalek kernel: RBP: 0000000000000000 R08: ffffd1fb8176fc38 R09: ffffd1fb8176fc40
> Jun 14 18:08:32 dalek kernel: R10: ffffd1fb8176fc34 R11: 0000000000000000 R12: 0000000000000000
> Jun 14 18:08:32 dalek kernel: R13: ffffd1fb8176fd90 R14: 0000000000000001 R15: ffff8d1a8eb28b00
> Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
> Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
> Jun 14 18:08:32 dalek kernel: Call Trace:
> Jun 14 18:08:32 dalek kernel:  <TASK>
> Jun 14 18:08:32 dalek kernel:  do_region+0x227/0x2a0
> Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
> Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
> Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
> Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
> Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
> Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
> Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
> Jun 14 18:08:32 dalek kernel:  </TASK>
> Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
> Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
> Jun 14 18:08:32 dalek kernel: WARNING: drivers/scsi/scsi_lib.c:1164 at scsi_alloc_sgtables+0x38a/0x400, CPU#15: kworker/15:1/369
> Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
> Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
> Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Tainted: G        W           7.1.0-rc7+ #786 PREEMPT(lazy) 
> Jun 14 18:08:32 dalek kernel: Tainted: [W]=WARN
> Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
> Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
> Jun 14 18:08:32 dalek kernel: RIP: 0010:scsi_alloc_sgtables+0x38a/0x400
> Jun 14 18:08:32 dalek kernel: Code: 8b 3d ba 2d a9 01 e9 d1 fd ff ff 48 8b 75 00 48 8d bb f0 fe ff ff e8 15 b7 b0 ff 48 89 ab e0 00 00 00 89 45 08 e9 30 ff ff ff <0f> 0b 4c 8b 6c 24 30 b8 0a 00 00 00 e9 21 ff ff ff b8 09 00 00 00
> Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176f7f0 EFLAGS: 00010246
> Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffff8d1aedad0110 RCX: 0000000000000009
> Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: ffffffff99c15960 RDI: ffff8d1aedad0110
> Jun 14 18:08:32 dalek kernel: RBP: ffff8d1a93d17000 R08: ffff8d1aedad0110 R09: ffff8d1a818fa800
> Jun 14 18:08:32 dalek kernel: R10: 7020676e69736961 R11: 0000000000000000 R12: 0000000000000000
> Jun 14 18:08:32 dalek kernel: R13: 0000000000000000 R14: ffff8d1a93394000 R15: ffff8d1a93d17000
> Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
> Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
> Jun 14 18:08:32 dalek kernel: Call Trace:
> Jun 14 18:08:32 dalek kernel:  <TASK>
> Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> Jun 14 18:08:32 dalek kernel:  sd_setup_read_write_cmnd+0x9d/0x740
> Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> Jun 14 18:08:32 dalek kernel:  scsi_queue_rq+0x4d2/0x890
> Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_rq_list+0x241/0x530
> Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> Jun 14 18:08:32 dalek kernel:  ? sbitmap_get+0x61/0x100
> Jun 14 18:08:32 dalek kernel:  __blk_mq_do_dispatch_sched+0x330/0x340
> Jun 14 18:08:32 dalek kernel:  __blk_mq_sched_dispatch_requests+0x143/0x180
> Jun 14 18:08:32 dalek kernel:  blk_mq_sched_dispatch_requests+0x2d/0x70
> Jun 14 18:08:32 dalek kernel:  blk_mq_run_hw_queue+0x2bf/0x350
> Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_list+0x172/0x350
> Jun 14 18:08:32 dalek kernel:  blk_mq_flush_plug_list+0x51/0x1a0
> Jun 14 18:08:32 dalek kernel:  ? blk_mq_submit_bio+0x71c/0x9f0
> Jun 14 18:08:32 dalek kernel:  __blk_flush_plug+0x112/0x180
> Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
> Jun 14 18:08:32 dalek kernel:  __submit_bio+0x19c/0x260
> Jun 14 18:08:32 dalek kernel:  __submit_bio_noacct+0x8e/0x210
> Jun 14 18:08:32 dalek kernel:  do_region+0x14c/0x2a0
> Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
> Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
> Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
> Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
> Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
> Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
> Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
> Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
> Jun 14 18:08:32 dalek kernel:  </TASK>
> Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> Jun 14 18:08:37 dalek kernel: blk_print_req_error: 241000 callbacks suppressed
> Jun 14 18:08:37 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
> 
> 


^ permalink raw reply

* [PATCH] block: genhd: Add NULL check for kobject_create_and_add in genhd_device_init
From: Li Jun @ 2026-06-15 10:15 UTC (permalink / raw)
  To: lijun01, axboe, linux-block

The kobject_create_and_add() call in genhd_device_init() may return NULL
if memory allocation fails, but the return value was not being checked.
This could lead to NULL pointer dereferences in subsequent calls to
sysfs_create_link() and sysfs_remove_link() which use block_depr.

Add proper error checking and cleanup path to handle the case when
kobject_create_and_add() fails.

Fixes: 721da5cee9d4 ("driver core: remove CONFIG_SYSFS_DEPRECATED and CONFIG_SYSFS_DEPRECATED_V2")
Signed-off-by: Li Jun <lijun01@kylinos.cn>
---
 block/genhd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index 7d4ee5972338..60569d59cd53 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1005,7 +1005,15 @@ static int __init genhd_device_init(void)
 
 	/* create top-level block dir */
 	block_depr = kobject_create_and_add("block", NULL);
+	if (!block_depr) {
+		error = -ENOMEM;
+		goto out_class_unregister;
+	}
 	return 0;
+
+out_class_unregister:
+	class_unregister(&block_class);
+	return error;
 }
 
 subsys_initcall(genhd_device_init);
-- 
2.25.1


^ permalink raw reply related

* [PATCH blktests] ublk: mark all tests as QUICK
From: Sebastian Chlad @ 2026-06-15  9:41 UTC (permalink / raw)
  To: shinichiro.kawasaki; +Cc: linux-block, Sebastian Chlad

These tests are quick to run so mark them accordingly to ensure
they are included in quick runs.

Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
---

I checked locally - all tests execute well below 10 seconds

 tests/ublk/001 | 1 +
 tests/ublk/002 | 1 +
 tests/ublk/003 | 1 +
 tests/ublk/004 | 1 +
 tests/ublk/005 | 1 +
 tests/ublk/006 | 1 +
 6 files changed, 6 insertions(+)

diff --git a/tests/ublk/001 b/tests/ublk/001
index 3435316..c994cff 100755
--- a/tests/ublk/001
+++ b/tests/ublk/001
@@ -7,6 +7,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test ublk delete"
+QUICK=1
 
 _run() {
 	local type=$1
diff --git a/tests/ublk/002 b/tests/ublk/002
index ca357b6..aaea4a7 100755
--- a/tests/ublk/002
+++ b/tests/ublk/002
@@ -7,6 +7,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test ublk crash with delete after dead confirmation"
+QUICK=1
 
 _run() {
 	local type=$1
diff --git a/tests/ublk/003 b/tests/ublk/003
index e366813..40bbd6f 100755
--- a/tests/ublk/003
+++ b/tests/ublk/003
@@ -7,6 +7,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test mounting block device exported by ublk"
+QUICK=1
 
 requires() {
 	_have_program mkfs.ext4
diff --git a/tests/ublk/004 b/tests/ublk/004
index 1d74fea..6812431 100755
--- a/tests/ublk/004
+++ b/tests/ublk/004
@@ -7,6 +7,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test ublk crash with delete just after daemon kill"
+QUICK=1
 
 _run() {
 	local type=$1
diff --git a/tests/ublk/005 b/tests/ublk/005
index 1e21674..69c1fca 100755
--- a/tests/ublk/005
+++ b/tests/ublk/005
@@ -9,6 +9,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test ublk recovery with one time daemon kill"
+QUICK=1
 
 _run() {
 	local type=$1
diff --git a/tests/ublk/006 b/tests/ublk/006
index 85087bd..2a4d886 100755
--- a/tests/ublk/006
+++ b/tests/ublk/006
@@ -9,6 +9,7 @@
 . tests/ublk/rc
 
 DESCRIPTION="test ublk recovery with two times daemon kill"
+QUICK=1
 
 _run() {
 	local type=$1
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH] block: check bio split for unaligned bvec
From: Carlos Maiolino @ 2026-06-15  9:37 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, axboe, hch, Keith Busch
In-Reply-To: <20260612223205.465913-1-kbusch@meta.com>

On Fri, Jun 12, 2026 at 03:32:04PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Offsets and lengths need to be validated against the dma alignment. This
> check was skipped for a sufficiently small bio with a single bvec, which
> may allow an invalid request to be dispatched to the driver. Force the
> validation for an unaligned bvec by forcing the bio split path that
> handles this condition.
> 
> Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
> Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
> Reported-by: Carlos Maiolino <cem@kernel.org>
> Signed-off-by: Keith Busch <kbusch@kernel.org>

Jens was quick enough but if needed anyway, I've tested this locally,
so:

Tested-by: Carlos Maiolino <cem@kernel.org>
Reviewed-by: Carlos Maiolino <cem@kernel.org>

> ---
>  block/blk.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/block/blk.h b/block/blk.h
> index 1a2d9101bba04..004048fa0c5a8 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -404,6 +404,8 @@ static inline bool bio_may_need_split(struct bio *bio,
>  	bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
>  	if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done)
>  		return true;
> +	if ((bv->bv_offset | bv->bv_len) & lim->dma_alignment)
> +		return true;
>  	return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size;
>  }
>  
> -- 
> 2.52.0
> 

^ permalink raw reply

* Re: [PATCH v5 5/9] block: implement NVMEM provider
From: Loic Poulain @ 2026-06-15  9:33 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
	linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
	Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
	Jeff Johnson, Marcel Holtmann, Luiz Augusto von Dentz,
	Balakrishna Godavarthi, Rocky Liao, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Srinivas Kandagatla,
	Andrew Lunn, Heiner Kallweit, Russell King, Saravana Kannan
In-Reply-To: <CAFEp6-0qsqhcwnSjm3=bG21jsCktzn5-L5sk2pNTZcGuVXaiNA@mail.gmail.com>

On Mon, Jun 15, 2026 at 11:28 AM Loic Poulain
<loic.poulain@oss.qualcomm.com> wrote:
>
> On Mon, Jun 15, 2026 at 10:53 AM Bartosz Golaszewski <brgl@kernel.org> wrote:
> >
> > On Fri, 12 Jun 2026 15:20:57 +0200, Loic Poulain
> > <loic.poulain@oss.qualcomm.com> said:
> > > From: Daniel Golle <daniel@makrotopia.org>
> > >
> > > On embedded devices using an eMMC it is common that one or more partitions
> > > on the eMMC are used to store MAC addresses and Wi-Fi calibration EEPROM
> > > data. Allow referencing the partition in device tree for the kernel and
> > > Wi-Fi drivers accessing it via the NVMEM layer.
> > >
> > > For now, NVMEM is only registered for the whole disk block device, as the
> > > OF node is currently only associated to it.
> > >
> > > Signed-off-by: Daniel Golle <daniel@makrotopia.org>
> > > Co-developed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> > > Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> > > ---
> > >  block/Kconfig             |   9 ++++
> > >  block/Makefile            |   1 +
> > >  block/blk-nvmem.c         | 109 ++++++++++++++++++++++++++++++++++++++++++++++
> > >  block/blk.h               |   8 ++++
> > >  block/genhd.c             |   4 ++
> > >  include/linux/blk_types.h |   3 ++
> > >  include/linux/blkdev.h    |   1 +
> > >  7 files changed, 135 insertions(+)
> > >
> > > diff --git a/block/Kconfig b/block/Kconfig
> > > index 15027963472d7b40e27b9097a5993c457b5b3054..0b33747e16dc33473683706f75c92bdf8b648f7c 100644
> > > --- a/block/Kconfig
> > > +++ b/block/Kconfig
> > > @@ -209,6 +209,15 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
> > >         by falling back to the kernel crypto API when inline
> > >         encryption hardware is not present.
> > >
> > > +config BLK_NVMEM
> > > +     bool "Block device NVMEM provider"
> > > +     depends on OF
> > > +     depends on NVMEM
> > > +     help
> > > +       Allow block devices (or partitions) to act as NVMEM providers,
> > > +       typically used with eMMC to store MAC addresses or Wi-Fi
> > > +       calibration data on embedded devices.
> > > +
> > >  source "block/partitions/Kconfig"
> > >
> > >  config BLK_PM
> > > diff --git a/block/Makefile b/block/Makefile
> > > index 7dce2e44276c4274c11a0a61121c83d9c43d6e0c..d7ac389e71902bc091a8800ea266190a43b3e63d 100644
> > > --- a/block/Makefile
> > > +++ b/block/Makefile
> > > @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
> > >                                          blk-crypto-sysfs.o
> > >  obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
> > >  obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)        += holder.o
> > > +obj-$(CONFIG_BLK_NVMEM)                += blk-nvmem.o
> > > diff --git a/block/blk-nvmem.c b/block/blk-nvmem.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..c005f059d9fe56242ebaef9905673dff902b5686
> > > --- /dev/null
> > > +++ b/block/blk-nvmem.c
> > > @@ -0,0 +1,109 @@
> > > +// SPDX-License-Identifier: GPL-2.0-or-later
> > > +/*
> > > + * block device NVMEM provider
> > > + *
> > > + * Copyright (c) 2024 Daniel Golle <daniel@makrotopia.org>
> > > + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> > > + *
> > > + * Useful on devices using a partition on an eMMC for MAC addresses or
> > > + * Wi-Fi calibration EEPROM data.
> > > + */
> > > +
> > > +#include <linux/file.h>
> > > +#include <linux/nvmem-provider.h>
> > > +#include <linux/nvmem-consumer.h>
> > > +#include <linux/of.h>
> > > +#include <linux/pagemap.h>
> > > +#include <linux/property.h>
> > > +
> > > +#include "blk.h"
> > > +
> > > +static int blk_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes)
> > > +{
> > > +     blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES;
> > > +     dev_t devt = (dev_t)(uintptr_t)priv;
> > > +     size_t bytes_left = bytes;
> > > +     loff_t pos = from;
> > > +     int ret = 0;
> > > +
> > > +     struct file *bdev_file __free(fput) = bdev_file_open_by_dev(devt, mode, priv, NULL);
> > > +     if (IS_ERR(bdev_file))
> > > +             return PTR_ERR(bdev_file);
> > > +
> > > +     while (bytes_left) {
> > > +             pgoff_t f_index = pos >> PAGE_SHIFT;
> > > +             struct folio *folio;
> > > +             size_t folio_off;
> > > +             size_t to_read;
> > > +
> > > +             folio = read_mapping_folio(bdev_file->f_mapping, f_index, NULL);
> > > +             if (IS_ERR(folio)) {
> > > +                     ret = PTR_ERR(folio);
> > > +                     break;
> > > +             }
> > > +
> > > +             folio_off = offset_in_folio(folio, pos);
> > > +             to_read = min(bytes_left, folio_size(folio) - folio_off);
> > > +             memcpy_from_folio(val, folio, folio_off, to_read);
> > > +             pos += to_read;
> > > +             bytes_left -= to_read;
> > > +             val += to_read;
> > > +             folio_put(folio);
> > > +     }
> > > +
> > > +     return ret;
> > > +}
> > > +
> > > +void blk_nvmem_add(struct block_device *bdev)
> > > +{
> > > +     struct device *dev = &bdev->bd_device;
> > > +     struct nvmem_config config = {};
> > > +
> > > +     /* skip devices which do not have a device tree node */
> > > +     if (!dev_of_node(dev))
> > > +             return;
> > > +
> > > +     /* skip devices without an nvmem layout defined */
> > > +     struct device_node *child __free(device_node) =
> > > +             of_get_child_by_name(dev_of_node(dev), "nvmem-layout");
> > > +     if (!child)
> > > +             return;
> > > +
> > > +     /*
> > > +      * skip block device too large to be represented as NVMEM devices,
> > > +      * the NVMEM reg_read callback uses an unsigned int offset
> > > +      */
> > > +     if (bdev_nr_bytes(bdev) > UINT_MAX) {
> > > +             dev_warn(dev, "block device too large to be an NVMEM provider\n");
> > > +             return;
> > > +     }
> > > +
> > > +     config.id = NVMEM_DEVID_NONE;
> > > +     config.dev = dev;
> > > +     config.name = dev_name(dev);
> > > +     config.owner = THIS_MODULE;
> > > +     config.priv = (void *)(uintptr_t)dev->devt;
> > > +     config.reg_read = blk_nvmem_reg_read;
> > > +     config.size = bdev_nr_bytes(bdev);
> > > +     config.word_size = 1;
> > > +     config.stride = 1;
> > > +     config.read_only = true;
> > > +     config.root_only = true;
> > > +     config.ignore_wp = true;
> > > +     config.of_node = to_of_node(dev->fwnode);
> > > +
> > > +     bdev->bd_nvmem = nvmem_register(&config);
> > > +     if (IS_ERR(bdev->bd_nvmem)) {
> > > +             dev_err_probe(dev, PTR_ERR(bdev->bd_nvmem),
> > > +                           "Failed to register NVMEM device\n");
> >
> > Using dev_err_probe() only makes sense with a return value. Which makes me
> > think: we won't retry this after a probe deferral. I think we should return
>
> Yes, so here with the nvmem fixed-layout, there is no way to get a
> deferred probe error, but better to be ready to handle this anyway.
>
> > int from this function just for this use-case. Also: if we *do* have
> > a layout, shouldn't we treat a failure to register the nvmem provider as
> > an error and propagate it up the stack?
>
> From an API perspective we should indeed return the error. From the
> block core's perspective, do we want to fail the entire disk addition
> just because the 'companion' NVMEM provider couldn't be registered, or
> should we only abort/return in case of EPROBE_DEFER?

Also, we cannot safely return -EPROBE_DEFER from add_disk_final()
either: the NVMEM registration point is late in the sequence, and too
much has already happened to unwind easily. The easiest option is that
the NVMEM device simply won't be available if registration fails, which
looks acceptable?

>
> >
> > > +             bdev->bd_nvmem = NULL;
> > > +     }
> > > +}
> > > +
> > > +void blk_nvmem_del(struct block_device *bdev)
> > > +{
> > > +     if (bdev->bd_nvmem)
> >
> > Nvmem core already performs a NULL check.
>
> Ok, thanks!
>
>
> >
> > > +             nvmem_unregister(bdev->bd_nvmem);
> > > +
> > > +     bdev->bd_nvmem = NULL;
> > > +}
> > > diff --git a/block/blk.h b/block/blk.h
> > > index ec4674cdf2ead4fd259ff5fc42401f591e684ee9..cd3c7ca723391c40be56f1dd4810e641b7c8a2b3 100644
> > > --- a/block/blk.h
> > > +++ b/block/blk.h
> > > @@ -757,4 +757,12 @@ static inline void blk_debugfs_unlock(struct request_queue *q,
> > >       memalloc_noio_restore(memflags);
> > >  }
> > >
> > > +#ifdef CONFIG_BLK_NVMEM
> > > +void blk_nvmem_add(struct block_device *bdev);
> > > +void blk_nvmem_del(struct block_device *bdev);
> > > +#else
> > > +static inline void blk_nvmem_add(struct block_device *bdev) {}
> > > +static inline void blk_nvmem_del(struct block_device *bdev) {}
> > > +#endif
> > > +
> > >  #endif /* BLK_INTERNAL_H */
> > > diff --git a/block/genhd.c b/block/genhd.c
> > > index 7d6854fd28e95ae9134309679a7c6a937f5b7db8..1b2382de6fb30c1e5f60f45c04dc03ed3bf5d5f2 100644
> > > --- a/block/genhd.c
> > > +++ b/block/genhd.c
> > > @@ -421,6 +421,8 @@ static void add_disk_final(struct gendisk *disk)
> > >                */
> > >               dev_set_uevent_suppress(ddev, 0);
> > >               disk_uevent(disk, KOBJ_ADD);
> > > +
> > > +             blk_nvmem_add(disk->part0);
> > >       }
> > >
> > >       blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
> > > @@ -704,6 +706,8 @@ static void __del_gendisk(struct gendisk *disk)
> > >
> > >       disk_del_events(disk);
> > >
> > > +     blk_nvmem_del(disk->part0);
> > > +
> > >       /*
> > >        * Prevent new openers by unlinked the bdev inode.
> > >        */
> > > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > > index 8808ee76e73c09e0ceaac41ba59e86fb0c4efc64..ace6f59b860d0813665b2f62a1c03a1f4be94059 100644
> > > --- a/include/linux/blk_types.h
> > > +++ b/include/linux/blk_types.h
> > > @@ -73,6 +73,9 @@ struct block_device {
> > >       int                     bd_writers;
> > >  #ifdef CONFIG_SECURITY
> > >       void                    *bd_security;
> > > +#endif
> > > +#ifdef CONFIG_BLK_NVMEM
> > > +     struct nvmem_device     *bd_nvmem;
> > >  #endif
> > >       /*
> > >        * keep this out-of-line as it's both big and not needed in the fast
> > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > > index 890128cdea1ce66863c5baa36f3b336ec4550807..f15d2b5bf9e4fd2368b8a70416a978e22c0d4333 100644
> > > --- a/include/linux/blkdev.h
> > > +++ b/include/linux/blkdev.h
> > > @@ -30,6 +30,7 @@
> > >
> > >  struct module;
> > >  struct request_queue;
> > > +struct nvmem_device;
> > >  struct elevator_queue;
> > >  struct blk_trace;
> > >  struct request;
> > >
> > > --
> > > 2.34.1
> > >
> > >
> >
> > I like this approach better than the previous one.
> >
> > Thanks,
> > Bartosz

^ permalink raw reply

* Re: [PATCH v5 5/9] block: implement NVMEM provider
From: Loic Poulain @ 2026-06-15  9:28 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
	linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
	Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
	Jeff Johnson, Marcel Holtmann, Luiz Augusto von Dentz,
	Balakrishna Godavarthi, Rocky Liao, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Srinivas Kandagatla,
	Andrew Lunn, Heiner Kallweit, Russell King, Saravana Kannan
In-Reply-To: <CAMRc=McQkLnz2OS2RREAbcrsp47cL-W3bCduq8LwPBBUcVNyJw@mail.gmail.com>

On Mon, Jun 15, 2026 at 10:53 AM Bartosz Golaszewski <brgl@kernel.org> wrote:
>
> On Fri, 12 Jun 2026 15:20:57 +0200, Loic Poulain
> <loic.poulain@oss.qualcomm.com> said:
> > From: Daniel Golle <daniel@makrotopia.org>
> >
> > On embedded devices using an eMMC it is common that one or more partitions
> > on the eMMC are used to store MAC addresses and Wi-Fi calibration EEPROM
> > data. Allow referencing the partition in device tree for the kernel and
> > Wi-Fi drivers accessing it via the NVMEM layer.
> >
> > For now, NVMEM is only registered for the whole disk block device, as the
> > OF node is currently only associated to it.
> >
> > Signed-off-by: Daniel Golle <daniel@makrotopia.org>
> > Co-developed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> > Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> > ---
> >  block/Kconfig             |   9 ++++
> >  block/Makefile            |   1 +
> >  block/blk-nvmem.c         | 109 ++++++++++++++++++++++++++++++++++++++++++++++
> >  block/blk.h               |   8 ++++
> >  block/genhd.c             |   4 ++
> >  include/linux/blk_types.h |   3 ++
> >  include/linux/blkdev.h    |   1 +
> >  7 files changed, 135 insertions(+)
> >
> > diff --git a/block/Kconfig b/block/Kconfig
> > index 15027963472d7b40e27b9097a5993c457b5b3054..0b33747e16dc33473683706f75c92bdf8b648f7c 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -209,6 +209,15 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
> >         by falling back to the kernel crypto API when inline
> >         encryption hardware is not present.
> >
> > +config BLK_NVMEM
> > +     bool "Block device NVMEM provider"
> > +     depends on OF
> > +     depends on NVMEM
> > +     help
> > +       Allow block devices (or partitions) to act as NVMEM providers,
> > +       typically used with eMMC to store MAC addresses or Wi-Fi
> > +       calibration data on embedded devices.
> > +
> >  source "block/partitions/Kconfig"
> >
> >  config BLK_PM
> > diff --git a/block/Makefile b/block/Makefile
> > index 7dce2e44276c4274c11a0a61121c83d9c43d6e0c..d7ac389e71902bc091a8800ea266190a43b3e63d 100644
> > --- a/block/Makefile
> > +++ b/block/Makefile
> > @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
> >                                          blk-crypto-sysfs.o
> >  obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
> >  obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)        += holder.o
> > +obj-$(CONFIG_BLK_NVMEM)                += blk-nvmem.o
> > diff --git a/block/blk-nvmem.c b/block/blk-nvmem.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..c005f059d9fe56242ebaef9905673dff902b5686
> > --- /dev/null
> > +++ b/block/blk-nvmem.c
> > @@ -0,0 +1,109 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * block device NVMEM provider
> > + *
> > + * Copyright (c) 2024 Daniel Golle <daniel@makrotopia.org>
> > + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> > + *
> > + * Useful on devices using a partition on an eMMC for MAC addresses or
> > + * Wi-Fi calibration EEPROM data.
> > + */
> > +
> > +#include <linux/file.h>
> > +#include <linux/nvmem-provider.h>
> > +#include <linux/nvmem-consumer.h>
> > +#include <linux/of.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/property.h>
> > +
> > +#include "blk.h"
> > +
> > +static int blk_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes)
> > +{
> > +     blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES;
> > +     dev_t devt = (dev_t)(uintptr_t)priv;
> > +     size_t bytes_left = bytes;
> > +     loff_t pos = from;
> > +     int ret = 0;
> > +
> > +     struct file *bdev_file __free(fput) = bdev_file_open_by_dev(devt, mode, priv, NULL);
> > +     if (IS_ERR(bdev_file))
> > +             return PTR_ERR(bdev_file);
> > +
> > +     while (bytes_left) {
> > +             pgoff_t f_index = pos >> PAGE_SHIFT;
> > +             struct folio *folio;
> > +             size_t folio_off;
> > +             size_t to_read;
> > +
> > +             folio = read_mapping_folio(bdev_file->f_mapping, f_index, NULL);
> > +             if (IS_ERR(folio)) {
> > +                     ret = PTR_ERR(folio);
> > +                     break;
> > +             }
> > +
> > +             folio_off = offset_in_folio(folio, pos);
> > +             to_read = min(bytes_left, folio_size(folio) - folio_off);
> > +             memcpy_from_folio(val, folio, folio_off, to_read);
> > +             pos += to_read;
> > +             bytes_left -= to_read;
> > +             val += to_read;
> > +             folio_put(folio);
> > +     }
> > +
> > +     return ret;
> > +}
> > +
> > +void blk_nvmem_add(struct block_device *bdev)
> > +{
> > +     struct device *dev = &bdev->bd_device;
> > +     struct nvmem_config config = {};
> > +
> > +     /* skip devices which do not have a device tree node */
> > +     if (!dev_of_node(dev))
> > +             return;
> > +
> > +     /* skip devices without an nvmem layout defined */
> > +     struct device_node *child __free(device_node) =
> > +             of_get_child_by_name(dev_of_node(dev), "nvmem-layout");
> > +     if (!child)
> > +             return;
> > +
> > +     /*
> > +      * skip block device too large to be represented as NVMEM devices,
> > +      * the NVMEM reg_read callback uses an unsigned int offset
> > +      */
> > +     if (bdev_nr_bytes(bdev) > UINT_MAX) {
> > +             dev_warn(dev, "block device too large to be an NVMEM provider\n");
> > +             return;
> > +     }
> > +
> > +     config.id = NVMEM_DEVID_NONE;
> > +     config.dev = dev;
> > +     config.name = dev_name(dev);
> > +     config.owner = THIS_MODULE;
> > +     config.priv = (void *)(uintptr_t)dev->devt;
> > +     config.reg_read = blk_nvmem_reg_read;
> > +     config.size = bdev_nr_bytes(bdev);
> > +     config.word_size = 1;
> > +     config.stride = 1;
> > +     config.read_only = true;
> > +     config.root_only = true;
> > +     config.ignore_wp = true;
> > +     config.of_node = to_of_node(dev->fwnode);
> > +
> > +     bdev->bd_nvmem = nvmem_register(&config);
> > +     if (IS_ERR(bdev->bd_nvmem)) {
> > +             dev_err_probe(dev, PTR_ERR(bdev->bd_nvmem),
> > +                           "Failed to register NVMEM device\n");
>
> Using dev_err_probe() only makes sense with a return value. Which makes me
> think: we won't retry this after a probe deferral. I think we should return

Yes, so here with the nvmem fixed-layout, there is no way to get a
deferred probe error, but better to be ready to handle this anyway.

> int from this function just for this use-case. Also: if we *do* have
> a layout, shouldn't we treat a failure to register the nvmem provider as
> an error and propagate it up the stack?

From an API perspective we should indeed return the error. From the
block core's perspective, do we want to fail the entire disk addition
just because the 'companion' NVMEM provider couldn't be registered, or
should we only abort/return in case of EPROBE_DEFER?

>
> > +             bdev->bd_nvmem = NULL;
> > +     }
> > +}
> > +
> > +void blk_nvmem_del(struct block_device *bdev)
> > +{
> > +     if (bdev->bd_nvmem)
>
> Nvmem core already performs a NULL check.

Ok, thanks!


>
> > +             nvmem_unregister(bdev->bd_nvmem);
> > +
> > +     bdev->bd_nvmem = NULL;
> > +}
> > diff --git a/block/blk.h b/block/blk.h
> > index ec4674cdf2ead4fd259ff5fc42401f591e684ee9..cd3c7ca723391c40be56f1dd4810e641b7c8a2b3 100644
> > --- a/block/blk.h
> > +++ b/block/blk.h
> > @@ -757,4 +757,12 @@ static inline void blk_debugfs_unlock(struct request_queue *q,
> >       memalloc_noio_restore(memflags);
> >  }
> >
> > +#ifdef CONFIG_BLK_NVMEM
> > +void blk_nvmem_add(struct block_device *bdev);
> > +void blk_nvmem_del(struct block_device *bdev);
> > +#else
> > +static inline void blk_nvmem_add(struct block_device *bdev) {}
> > +static inline void blk_nvmem_del(struct block_device *bdev) {}
> > +#endif
> > +
> >  #endif /* BLK_INTERNAL_H */
> > diff --git a/block/genhd.c b/block/genhd.c
> > index 7d6854fd28e95ae9134309679a7c6a937f5b7db8..1b2382de6fb30c1e5f60f45c04dc03ed3bf5d5f2 100644
> > --- a/block/genhd.c
> > +++ b/block/genhd.c
> > @@ -421,6 +421,8 @@ static void add_disk_final(struct gendisk *disk)
> >                */
> >               dev_set_uevent_suppress(ddev, 0);
> >               disk_uevent(disk, KOBJ_ADD);
> > +
> > +             blk_nvmem_add(disk->part0);
> >       }
> >
> >       blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
> > @@ -704,6 +706,8 @@ static void __del_gendisk(struct gendisk *disk)
> >
> >       disk_del_events(disk);
> >
> > +     blk_nvmem_del(disk->part0);
> > +
> >       /*
> >        * Prevent new openers by unlinked the bdev inode.
> >        */
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index 8808ee76e73c09e0ceaac41ba59e86fb0c4efc64..ace6f59b860d0813665b2f62a1c03a1f4be94059 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -73,6 +73,9 @@ struct block_device {
> >       int                     bd_writers;
> >  #ifdef CONFIG_SECURITY
> >       void                    *bd_security;
> > +#endif
> > +#ifdef CONFIG_BLK_NVMEM
> > +     struct nvmem_device     *bd_nvmem;
> >  #endif
> >       /*
> >        * keep this out-of-line as it's both big and not needed in the fast
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 890128cdea1ce66863c5baa36f3b336ec4550807..f15d2b5bf9e4fd2368b8a70416a978e22c0d4333 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -30,6 +30,7 @@
> >
> >  struct module;
> >  struct request_queue;
> > +struct nvmem_device;
> >  struct elevator_queue;
> >  struct blk_trace;
> >  struct request;
> >
> > --
> > 2.34.1
> >
> >
>
> I like this approach better than the previous one.
>
> Thanks,
> Bartosz

^ permalink raw reply

* Re: [PATCH 0/3] mm/zram: route block swap I/O through swap_ops
From: Barry Song @ 2026-06-15  9:14 UTC (permalink / raw)
  To: Jianyue Wu
  Cc: Andrew Morton, Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

On Sun, Jun 14, 2026 at 11:35 PM Jianyue Wu <wujianyue000@gmail.com> wrote:
>
> This series builds on Christoph Hellwig's swap batching rework that
> moves block swap onto struct swap_iocb and per-backend struct
> swap_ops handlers [1].  Christoph's patches unify batching for
> ordinary block devices and swap files.  zram still needs a custom
> path because swap slots map to compressed pages, not disk sectors.
>
> The first patch adds swap_register_block_ops() so a block driver can
> install custom submit_read/submit_write handlers when swapon targets
> its block device.  The default swap_bdev_ops path is unchanged for
> devices that do not register.
>
> The second patch registers zram_swap_ops at module init.  On write,
> the swap core still batches folios into a swap_iocb.  zram maps each
> folio to a slot index and stores it through zram_write_page() instead
> of building one bio per page.  Read handling keeps slot_lock and
> mark_slot_accessed() in one critical section.  Writeback-enabled zram
> falls back to swap_bdev_submit_read() for ZRAM_WB slots.
>
> The third patch moves slot_free_notify into swap_ops next to the
> other zram swap callbacks, and documents the locking contract for
> that hook.
>
> Applied on top of Christoph Hellwig's "better block swap batching and
> a different take on swap_ops" series [1].

Nice. I think it's better to mark it as RFC at this stage.

By the way, besides the architectural refinements, have
you also observed any noticeable performance improvements?

>
> [1] https://lore.kernel.org/linux-mm/?q=better+block+swap+batching

Best Regards
Barry

^ permalink raw reply

* [PATCH v1 2/2] virtio-blk: mark disk dead on ERS permanent failure
From: Xixin Liu @ 2026-06-12 10:00 UTC (permalink / raw)
  To: linux-block, virtualization
  Cc: mst, jasowang, xuanzhuo, eperezma, pbonzini, stefanha, axboe,
	linux-kernel, liuxixin
In-Reply-To: <cover.virtio-blk-ers-v1.1780449274.git.liuxixin@kylinos.cn>

After ERS reports pci_channel_io_perm_failure, virtio-pci must ask the
virtio driver to tear down the block device — not only mark virtqueues
broken.  Call the virtio driver shutdown hook from virtio-pci on
perm_failure; virtio-blk implements shutdown with blk_mark_disk_dead().
Fail new requests early in virtio_queue_rq when the disk is dead or
virtqueues were removed during frozen reset_prepare.

Signed-off-by: Xixin Liu <liuxixin@kylinos.cn>
---
 drivers/block/virtio_blk.c         | 39 +++++++++++++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci_common.c | 10 +++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 32bf3ba07a9d..4740ae91d5be 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -435,6 +435,12 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_status_t status;
 	int err;
 
+	/* Fail fast if ERS frozen tore down VQs or the disk was marked dead. */
+	if (unlikely(!disk_live(vblk->disk) || !vblk->vqs || !vblk->vdev)) {
+		blk_mq_start_request(req);
+		return BLK_STS_IOERR;
+	}
+
 	status = virtblk_prep_rq(hctx, vblk, req, vbr);
 	if (unlikely(status))
 		return status;
@@ -1561,6 +1567,29 @@ static int virtblk_probe(struct virtio_device *vdev)
 	return err;
 }
 
+/* Stop I/O and mark the gendisk dead (ERS perm_failure or system shutdown). */
+static void virtblk_shutdown(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+	struct request_queue *q;
+	unsigned int memflags;
+
+	if (!vblk || !vblk->disk)
+		return;
+
+	flush_work(&vblk->config_work);
+	virtio_break_device(vdev);
+
+	q = vblk->disk->queue;
+	memflags = blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue_nowait(q);
+
+	blk_mark_disk_dead(vblk->disk);
+
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q, memflags);
+}
+
 static void virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
@@ -1684,6 +1713,7 @@ static struct virtio_driver virtio_blk = {
 	.probe				= virtblk_probe,
 	.remove				= virtblk_remove,
 	.config_changed			= virtblk_config_changed,
+	.shutdown			= virtblk_shutdown,
 #ifdef CONFIG_PM_SLEEP
 	.freeze				= virtblk_freeze,
 	.restore			= virtblk_restore,
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index e2dda946e70e..924ceead436b 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -845,7 +845,15 @@ static pci_ers_result_t virtio_pci_error_detected(struct pci_dev *pci_dev,
 	case pci_channel_io_perm_failure:
 		dev_warn(&pci_dev->dev,
 			 "permanent failure, disconnecting device\n");
-		virtio_break_device(&vp_dev->vdev);
+		{
+			struct virtio_driver *drv =
+				drv_to_virtio(vp_dev->vdev.dev.driver);
+
+			if (drv && drv->shutdown)
+				drv->shutdown(&vp_dev->vdev);
+			else
+				virtio_break_device(&vp_dev->vdev);
+		}
 		return PCI_ERS_RESULT_DISCONNECT;
 	default:
 		break;


^ permalink raw reply related

* [PATCH v1 1/2] virtio-pci: add error_detected for PCI AER recovery
From: Xixin Liu @ 2026-06-10  6:20 UTC (permalink / raw)
  To: linux-block, virtualization
  Cc: mst, jasowang, xuanzhuo, eperezma, pbonzini, stefanha, axboe,
	linux-kernel, liuxixin
In-Reply-To: <cover.virtio-blk-ers-v1.1780449274.git.liuxixin@kylinos.cn>

virtio-pci only registered reset_prepare/reset_done.  The PCI error
recovery core treats devices without error_detected as NO_AER_DRIVER and
does not deliver pci_channel_io_perm_failure to the driver after a failed
recovery.  Virtio devices therefore miss the normal ERS quiesce/teardown
sequence.

Register error_detected: quiesce on frozen (reset_prepare) before bus
reset; on perm_failure break virtqueues and return DISCONNECT.  Block-layer
cleanup for virtio-blk is handled in the follow-up patch.

Signed-off-by: Xixin Liu <liuxixin@kylinos.cn>
---
 drivers/virtio/virtio_pci_common.c | 30 +++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 164f480b18a6..e2dda946e70e 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -828,7 +828,37 @@ static void virtio_pci_reset_done(struct pci_dev *pci_dev)
 		dev_warn(&pci_dev->dev, "Reset done failure: %d", ret);
 }
 
+static pci_ers_result_t virtio_pci_error_detected(struct pci_dev *pci_dev,
+						  pci_channel_state_t state)
+{
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+
+	/*
+	 * PCI ERS error_detected: quiesce on frozen before bus reset; on
+	 * permanent failure ask the virtio driver to shut down (virtio-blk
+	 * marks the disk dead in its .shutdown handler).
+	 */
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		pci_info(pci_dev, "frozen error detected, quiesce device\n");
+		if (virtio_device_reset_prepare(&vp_dev->vdev))
+			dev_warn(&pci_dev->dev, "frozen: reset prepare failed\n");
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		dev_warn(&pci_dev->dev,
+			 "permanent failure, disconnecting device\n");
+		virtio_break_device(&vp_dev->vdev);
+		return PCI_ERS_RESULT_DISCONNECT;
+	default:
+		break;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
 static const struct pci_error_handlers virtio_pci_err_handler = {
+	.error_detected = virtio_pci_error_detected,
 	.reset_prepare  = virtio_pci_reset_prepare,
 	.reset_done     = virtio_pci_reset_done,
 };


^ permalink raw reply related

* [PATCH v1 0/2] virtio: PCI ERS permanent failure teardown for virtio-blk
From: Xixin Liu @ 2026-06-15  2:00 UTC (permalink / raw)
  To: linux-block, virtualization
  Cc: mst, jasowang, xuanzhuo, eperezma, pbonzini, stefanha, axboe,
	linux-kernel, liuxixin

Hi,

This series adds proper PCI AER error recovery handling for virtio-pci and
completes virtio-blk teardown when ERS reports pci_channel_io_perm_failure.

virtio-pci only registered reset_prepare/reset_done.  The recovery core
treats devices without error_detected as NO_AER_DRIVER and does not
deliver perm_failure to the driver after a failed recovery.  When bus
reset fails (reproduced on QEMU with DLLLA not set within 100 ms after
secondary bus reset), virtio-blk disks stay live even though virtqueues
may already have been torn down during the frozen phase.

Patch 1 registers error_detected (frozen quiesce + perm_failure notify).
Patch 2 calls the virtio driver shutdown hook from virtio-pci on
perm_failure, implements virtio-blk shutdown with blk_mark_disk_dead(),
and adds fail-fast guards in virtio_queue_rq.

Thanks,
Xixin Liu

---

Xixin Liu (2):
  virtio-pci: add error_detected for PCI AER recovery
  virtio-blk: mark disk dead on ERS permanent failure

 drivers/block/virtio_blk.c         | 39 +++++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci_common.c | 47 ++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH v5 5/9] block: implement NVMEM provider
From: Bartosz Golaszewski @ 2026-06-15  8:53 UTC (permalink / raw)
  To: Loic Poulain
  Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
	linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
	Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
	Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
	Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
	Russell King, Saravana Kannan
In-Reply-To: <20260612-block-as-nvmem-v5-5-95e0b30fff90@oss.qualcomm.com>

On Fri, 12 Jun 2026 15:20:57 +0200, Loic Poulain
<loic.poulain@oss.qualcomm.com> said:
> From: Daniel Golle <daniel@makrotopia.org>
>
> On embedded devices using an eMMC it is common that one or more partitions
> on the eMMC are used to store MAC addresses and Wi-Fi calibration EEPROM
> data. Allow referencing the partition in device tree for the kernel and
> Wi-Fi drivers accessing it via the NVMEM layer.
>
> For now, NVMEM is only registered for the whole disk block device, as the
> OF node is currently only associated to it.
>
> Signed-off-by: Daniel Golle <daniel@makrotopia.org>
> Co-developed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> ---
>  block/Kconfig             |   9 ++++
>  block/Makefile            |   1 +
>  block/blk-nvmem.c         | 109 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk.h               |   8 ++++
>  block/genhd.c             |   4 ++
>  include/linux/blk_types.h |   3 ++
>  include/linux/blkdev.h    |   1 +
>  7 files changed, 135 insertions(+)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 15027963472d7b40e27b9097a5993c457b5b3054..0b33747e16dc33473683706f75c92bdf8b648f7c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -209,6 +209,15 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
>  	  by falling back to the kernel crypto API when inline
>  	  encryption hardware is not present.
>
> +config BLK_NVMEM
> +	bool "Block device NVMEM provider"
> +	depends on OF
> +	depends on NVMEM
> +	help
> +	  Allow block devices (or partitions) to act as NVMEM providers,
> +	  typically used with eMMC to store MAC addresses or Wi-Fi
> +	  calibration data on embedded devices.
> +
>  source "block/partitions/Kconfig"
>
>  config BLK_PM
> diff --git a/block/Makefile b/block/Makefile
> index 7dce2e44276c4274c11a0a61121c83d9c43d6e0c..d7ac389e71902bc091a8800ea266190a43b3e63d 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o \
>  					   blk-crypto-sysfs.o
>  obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
>  obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
> +obj-$(CONFIG_BLK_NVMEM)                += blk-nvmem.o
> diff --git a/block/blk-nvmem.c b/block/blk-nvmem.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c005f059d9fe56242ebaef9905673dff902b5686
> --- /dev/null
> +++ b/block/blk-nvmem.c
> @@ -0,0 +1,109 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * block device NVMEM provider
> + *
> + * Copyright (c) 2024 Daniel Golle <daniel@makrotopia.org>
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + *
> + * Useful on devices using a partition on an eMMC for MAC addresses or
> + * Wi-Fi calibration EEPROM data.
> + */
> +
> +#include <linux/file.h>
> +#include <linux/nvmem-provider.h>
> +#include <linux/nvmem-consumer.h>
> +#include <linux/of.h>
> +#include <linux/pagemap.h>
> +#include <linux/property.h>
> +
> +#include "blk.h"
> +
> +static int blk_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes)
> +{
> +	blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES;
> +	dev_t devt = (dev_t)(uintptr_t)priv;
> +	size_t bytes_left = bytes;
> +	loff_t pos = from;
> +	int ret = 0;
> +
> +	struct file *bdev_file __free(fput) = bdev_file_open_by_dev(devt, mode, priv, NULL);
> +	if (IS_ERR(bdev_file))
> +		return PTR_ERR(bdev_file);
> +
> +	while (bytes_left) {
> +		pgoff_t f_index = pos >> PAGE_SHIFT;
> +		struct folio *folio;
> +		size_t folio_off;
> +		size_t to_read;
> +
> +		folio = read_mapping_folio(bdev_file->f_mapping, f_index, NULL);
> +		if (IS_ERR(folio)) {
> +			ret = PTR_ERR(folio);
> +			break;
> +		}
> +
> +		folio_off = offset_in_folio(folio, pos);
> +		to_read = min(bytes_left, folio_size(folio) - folio_off);
> +		memcpy_from_folio(val, folio, folio_off, to_read);
> +		pos += to_read;
> +		bytes_left -= to_read;
> +		val += to_read;
> +		folio_put(folio);
> +	}
> +
> +	return ret;
> +}
> +
> +void blk_nvmem_add(struct block_device *bdev)
> +{
> +	struct device *dev = &bdev->bd_device;
> +	struct nvmem_config config = {};
> +
> +	/* skip devices which do not have a device tree node */
> +	if (!dev_of_node(dev))
> +		return;
> +
> +	/* skip devices without an nvmem layout defined */
> +	struct device_node *child __free(device_node) =
> +		of_get_child_by_name(dev_of_node(dev), "nvmem-layout");
> +	if (!child)
> +		return;
> +
> +	/*
> +	 * skip block device too large to be represented as NVMEM devices,
> +	 * the NVMEM reg_read callback uses an unsigned int offset
> +	 */
> +	if (bdev_nr_bytes(bdev) > UINT_MAX) {
> +		dev_warn(dev, "block device too large to be an NVMEM provider\n");
> +		return;
> +	}
> +
> +	config.id = NVMEM_DEVID_NONE;
> +	config.dev = dev;
> +	config.name = dev_name(dev);
> +	config.owner = THIS_MODULE;
> +	config.priv = (void *)(uintptr_t)dev->devt;
> +	config.reg_read = blk_nvmem_reg_read;
> +	config.size = bdev_nr_bytes(bdev);
> +	config.word_size = 1;
> +	config.stride = 1;
> +	config.read_only = true;
> +	config.root_only = true;
> +	config.ignore_wp = true;
> +	config.of_node = to_of_node(dev->fwnode);
> +
> +	bdev->bd_nvmem = nvmem_register(&config);
> +	if (IS_ERR(bdev->bd_nvmem)) {
> +		dev_err_probe(dev, PTR_ERR(bdev->bd_nvmem),
> +			      "Failed to register NVMEM device\n");

Using dev_err_probe() only makes sense with a return value. Which makes me
think: we won't retry this after a probe deferral. I think we should return
int from this function just for this use-case. Also: if we *do* have
a layout, shouldn't we treat a failure to register the nvmem provider as
an error and propagate it up the stack?

> +		bdev->bd_nvmem = NULL;
> +	}
> +}
> +
> +void blk_nvmem_del(struct block_device *bdev)
> +{
> +	if (bdev->bd_nvmem)

Nvmem core already performs a NULL check.

> +		nvmem_unregister(bdev->bd_nvmem);
> +
> +	bdev->bd_nvmem = NULL;
> +}
> diff --git a/block/blk.h b/block/blk.h
> index ec4674cdf2ead4fd259ff5fc42401f591e684ee9..cd3c7ca723391c40be56f1dd4810e641b7c8a2b3 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -757,4 +757,12 @@ static inline void blk_debugfs_unlock(struct request_queue *q,
>  	memalloc_noio_restore(memflags);
>  }
>
> +#ifdef CONFIG_BLK_NVMEM
> +void blk_nvmem_add(struct block_device *bdev);
> +void blk_nvmem_del(struct block_device *bdev);
> +#else
> +static inline void blk_nvmem_add(struct block_device *bdev) {}
> +static inline void blk_nvmem_del(struct block_device *bdev) {}
> +#endif
> +
>  #endif /* BLK_INTERNAL_H */
> diff --git a/block/genhd.c b/block/genhd.c
> index 7d6854fd28e95ae9134309679a7c6a937f5b7db8..1b2382de6fb30c1e5f60f45c04dc03ed3bf5d5f2 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -421,6 +421,8 @@ static void add_disk_final(struct gendisk *disk)
>  		 */
>  		dev_set_uevent_suppress(ddev, 0);
>  		disk_uevent(disk, KOBJ_ADD);
> +
> +		blk_nvmem_add(disk->part0);
>  	}
>
>  	blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
> @@ -704,6 +706,8 @@ static void __del_gendisk(struct gendisk *disk)
>
>  	disk_del_events(disk);
>
> +	blk_nvmem_del(disk->part0);
> +
>  	/*
>  	 * Prevent new openers by unlinked the bdev inode.
>  	 */
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 8808ee76e73c09e0ceaac41ba59e86fb0c4efc64..ace6f59b860d0813665b2f62a1c03a1f4be94059 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -73,6 +73,9 @@ struct block_device {
>  	int			bd_writers;
>  #ifdef CONFIG_SECURITY
>  	void			*bd_security;
> +#endif
> +#ifdef CONFIG_BLK_NVMEM
> +	struct nvmem_device	*bd_nvmem;
>  #endif
>  	/*
>  	 * keep this out-of-line as it's both big and not needed in the fast
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 890128cdea1ce66863c5baa36f3b336ec4550807..f15d2b5bf9e4fd2368b8a70416a978e22c0d4333 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -30,6 +30,7 @@
>
>  struct module;
>  struct request_queue;
> +struct nvmem_device;
>  struct elevator_queue;
>  struct blk_trace;
>  struct request;
>
> --
> 2.34.1
>
>

I like this approach better than the previous one.

Thanks,
Bartosz

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox