Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* Re: [PATCH v5 5/9] block: implement NVMEM provider
From: Bartosz Golaszewski @ 2026-06-15  8:53 UTC (permalink / raw)
  To: Loic Poulain
  Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
	linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
	Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
	Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
	Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
	Russell King, Saravana Kannan
In-Reply-To: <20260612-block-as-nvmem-v5-5-95e0b30fff90@oss.qualcomm.com>

On Fri, 12 Jun 2026 15:20:57 +0200, Loic Poulain
<loic.poulain@oss.qualcomm.com> said:
> From: Daniel Golle <daniel@makrotopia.org>
>
> On embedded devices using an eMMC it is common that one or more partitions
> on the eMMC are used to store MAC addresses and Wi-Fi calibration EEPROM
> data. Allow referencing the partition in device tree for the kernel and
> Wi-Fi drivers accessing it via the NVMEM layer.
>
> For now, NVMEM is only registered for the whole disk block device, as the
> OF node is currently only associated to it.
>
> Signed-off-by: Daniel Golle <daniel@makrotopia.org>
> Co-developed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> ---
>  block/Kconfig             |   9 ++++
>  block/Makefile            |   1 +
>  block/blk-nvmem.c         | 109 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk.h               |   8 ++++
>  block/genhd.c             |   4 ++
>  include/linux/blk_types.h |   3 ++
>  include/linux/blkdev.h    |   1 +
>  7 files changed, 135 insertions(+)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 15027963472d7b40e27b9097a5993c457b5b3054..0b33747e16dc33473683706f75c92bdf8b648f7c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -209,6 +209,15 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
>  	  by falling back to the kernel crypto API when inline
>  	  encryption hardware is not present.
>
> +config BLK_NVMEM
> +	bool "Block device NVMEM provider"
> +	depends on OF
> +	depends on NVMEM
> +	help
> +	  Allow block devices (or partitions) to act as NVMEM providers,
> +	  typically used with eMMC to store MAC addresses or Wi-Fi
> +	  calibration data on embedded devices.
> +
>  source "block/partitions/Kconfig"
>
>  config BLK_PM
> diff --git a/block/Makefile b/block/Makefile
> index 7dce2e44276c4274c11a0a61121c83d9c43d6e0c..d7ac389e71902bc091a8800ea266190a43b3e63d 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o \
>  					   blk-crypto-sysfs.o
>  obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
>  obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
> +obj-$(CONFIG_BLK_NVMEM)                += blk-nvmem.o
> diff --git a/block/blk-nvmem.c b/block/blk-nvmem.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c005f059d9fe56242ebaef9905673dff902b5686
> --- /dev/null
> +++ b/block/blk-nvmem.c
> @@ -0,0 +1,109 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * block device NVMEM provider
> + *
> + * Copyright (c) 2024 Daniel Golle <daniel@makrotopia.org>
> + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
> + *
> + * Useful on devices using a partition on an eMMC for MAC addresses or
> + * Wi-Fi calibration EEPROM data.
> + */
> +
> +#include <linux/file.h>
> +#include <linux/nvmem-provider.h>
> +#include <linux/nvmem-consumer.h>
> +#include <linux/of.h>
> +#include <linux/pagemap.h>
> +#include <linux/property.h>
> +
> +#include "blk.h"
> +
> +static int blk_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes)
> +{
> +	blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES;
> +	dev_t devt = (dev_t)(uintptr_t)priv;
> +	size_t bytes_left = bytes;
> +	loff_t pos = from;
> +	int ret = 0;
> +
> +	struct file *bdev_file __free(fput) = bdev_file_open_by_dev(devt, mode, priv, NULL);
> +	if (IS_ERR(bdev_file))
> +		return PTR_ERR(bdev_file);
> +
> +	while (bytes_left) {
> +		pgoff_t f_index = pos >> PAGE_SHIFT;
> +		struct folio *folio;
> +		size_t folio_off;
> +		size_t to_read;
> +
> +		folio = read_mapping_folio(bdev_file->f_mapping, f_index, NULL);
> +		if (IS_ERR(folio)) {
> +			ret = PTR_ERR(folio);
> +			break;
> +		}
> +
> +		folio_off = offset_in_folio(folio, pos);
> +		to_read = min(bytes_left, folio_size(folio) - folio_off);
> +		memcpy_from_folio(val, folio, folio_off, to_read);
> +		pos += to_read;
> +		bytes_left -= to_read;
> +		val += to_read;
> +		folio_put(folio);
> +	}
> +
> +	return ret;
> +}
> +
> +void blk_nvmem_add(struct block_device *bdev)
> +{
> +	struct device *dev = &bdev->bd_device;
> +	struct nvmem_config config = {};
> +
> +	/* skip devices which do not have a device tree node */
> +	if (!dev_of_node(dev))
> +		return;
> +
> +	/* skip devices without an nvmem layout defined */
> +	struct device_node *child __free(device_node) =
> +		of_get_child_by_name(dev_of_node(dev), "nvmem-layout");
> +	if (!child)
> +		return;
> +
> +	/*
> +	 * skip block device too large to be represented as NVMEM devices,
> +	 * the NVMEM reg_read callback uses an unsigned int offset
> +	 */
> +	if (bdev_nr_bytes(bdev) > UINT_MAX) {
> +		dev_warn(dev, "block device too large to be an NVMEM provider\n");
> +		return;
> +	}
> +
> +	config.id = NVMEM_DEVID_NONE;
> +	config.dev = dev;
> +	config.name = dev_name(dev);
> +	config.owner = THIS_MODULE;
> +	config.priv = (void *)(uintptr_t)dev->devt;
> +	config.reg_read = blk_nvmem_reg_read;
> +	config.size = bdev_nr_bytes(bdev);
> +	config.word_size = 1;
> +	config.stride = 1;
> +	config.read_only = true;
> +	config.root_only = true;
> +	config.ignore_wp = true;
> +	config.of_node = to_of_node(dev->fwnode);
> +
> +	bdev->bd_nvmem = nvmem_register(&config);
> +	if (IS_ERR(bdev->bd_nvmem)) {
> +		dev_err_probe(dev, PTR_ERR(bdev->bd_nvmem),
> +			      "Failed to register NVMEM device\n");

Using dev_err_probe() only makes sense with a return value. Which makes me
think: we won't retry this after a probe deferral. I think we should return
int from this function just for this use-case. Also: if we *do* have
a layout, shouldn't we treat a failure to register the nvmem provider as
an error and propagate it up the stack?

> +		bdev->bd_nvmem = NULL;
> +	}
> +}
> +
> +void blk_nvmem_del(struct block_device *bdev)
> +{
> +	if (bdev->bd_nvmem)

Nvmem core already performs a NULL check.

> +		nvmem_unregister(bdev->bd_nvmem);
> +
> +	bdev->bd_nvmem = NULL;
> +}
> diff --git a/block/blk.h b/block/blk.h
> index ec4674cdf2ead4fd259ff5fc42401f591e684ee9..cd3c7ca723391c40be56f1dd4810e641b7c8a2b3 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -757,4 +757,12 @@ static inline void blk_debugfs_unlock(struct request_queue *q,
>  	memalloc_noio_restore(memflags);
>  }
>
> +#ifdef CONFIG_BLK_NVMEM
> +void blk_nvmem_add(struct block_device *bdev);
> +void blk_nvmem_del(struct block_device *bdev);
> +#else
> +static inline void blk_nvmem_add(struct block_device *bdev) {}
> +static inline void blk_nvmem_del(struct block_device *bdev) {}
> +#endif
> +
>  #endif /* BLK_INTERNAL_H */
> diff --git a/block/genhd.c b/block/genhd.c
> index 7d6854fd28e95ae9134309679a7c6a937f5b7db8..1b2382de6fb30c1e5f60f45c04dc03ed3bf5d5f2 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -421,6 +421,8 @@ static void add_disk_final(struct gendisk *disk)
>  		 */
>  		dev_set_uevent_suppress(ddev, 0);
>  		disk_uevent(disk, KOBJ_ADD);
> +
> +		blk_nvmem_add(disk->part0);
>  	}
>
>  	blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
> @@ -704,6 +706,8 @@ static void __del_gendisk(struct gendisk *disk)
>
>  	disk_del_events(disk);
>
> +	blk_nvmem_del(disk->part0);
> +
>  	/*
>  	 * Prevent new openers by unlinked the bdev inode.
>  	 */
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 8808ee76e73c09e0ceaac41ba59e86fb0c4efc64..ace6f59b860d0813665b2f62a1c03a1f4be94059 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -73,6 +73,9 @@ struct block_device {
>  	int			bd_writers;
>  #ifdef CONFIG_SECURITY
>  	void			*bd_security;
> +#endif
> +#ifdef CONFIG_BLK_NVMEM
> +	struct nvmem_device	*bd_nvmem;
>  #endif
>  	/*
>  	 * keep this out-of-line as it's both big and not needed in the fast
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 890128cdea1ce66863c5baa36f3b336ec4550807..f15d2b5bf9e4fd2368b8a70416a978e22c0d4333 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -30,6 +30,7 @@
>
>  struct module;
>  struct request_queue;
> +struct nvmem_device;
>  struct elevator_queue;
>  struct blk_trace;
>  struct request;
>
> --
> 2.34.1
>
>

I like this approach better than the previous one.

Thanks,
Bartosz

^ permalink raw reply

* Re: [PATCH v5 1/9] block: partitions: of: Skip child nodes without reg property
From: Bartosz Golaszewski @ 2026-06-15  8:47 UTC (permalink / raw)
  To: Loic Poulain
  Cc: linux-mmc, devicetree, linux-kernel, linux-arm-msm, linux-block,
	linux-wireless, ath10k, linux-bluetooth, netdev, daniel,
	Ulf Hansson, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Bjorn Andersson, Konrad Dybcio, Jens Axboe, Johannes Berg,
	Jeff Johnson, Bartosz Golaszewski, Marcel Holtmann,
	Luiz Augusto von Dentz, Balakrishna Godavarthi, Rocky Liao,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Srinivas Kandagatla, Andrew Lunn, Heiner Kallweit,
	Russell King, Saravana Kannan
In-Reply-To: <20260612-block-as-nvmem-v5-1-95e0b30fff90@oss.qualcomm.com>

On Fri, 12 Jun 2026 15:20:53 +0200, Loic Poulain
<loic.poulain@oss.qualcomm.com> said:
> Child nodes of a fixed-partitions node are not necessarily partition
> entries, for example an nvmem-layout node has no reg property. The
> current code passes a NULL reg pointer and uninitialized len to the
> length check, which can result in a kernel panic or silent failure to
> register any partitions.
>
> Fix validate_of_partition() to return a skip indicator when no reg
> property is present. Guard add_of_partition() with a reg property
> check for the same reason.
>
> Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> ---

I think this warrants a Cc: stable and backporting as well as a Fixes tag.

Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>

^ permalink raw reply

* Re: [PATCH] nbd: Reclassify sockets to avoid lockdep circular dependency
From: Eric Dumazet @ 2026-06-15  7:53 UTC (permalink / raw)
  To: Hillf Danton
  Cc: linux-kernel, Jens Axboe, linux-block, nbd, Kuniyuki Iwashima,
	netdev, syzbot+607cdcf978b3e79da878
In-Reply-To: <20260613101214.1771-1-hdanton@sina.com>

On Sat, Jun 13, 2026 at 3:12 AM Hillf Danton <hdanton@sina.com> wrote:
>
> On Sat, 13 Jun 2026 04:26:19 +0000 Eric Dumazet wrote:
> > syzbot reported a possible circular locking dependency in udp_sendmsg()
> > where fs_reclaim can be triggered while holding sk_lock, and fs_reclaim
> > can eventually depend on another sk_lock (e.g., if NBD is used for swap
> > or writeback and NBD uses TLS/TCP which acquires sk_lock).
> >
> > Since the UDP socket and the NBD TCP/TLS socket are different, this is a
> > false positive. Fix this by reclassifying NBD sockets to a separate lock
> > class when they are added to the NBD device.
> >
> > This is similar to what nvme-tcp and other network block devices do.
> >
> > Fixes: ffa1e7ada456 ("block: Make request_queue lockdep splats show up earlier")
>
> Given the Fixes tag, can you specify anything wrong that commit added?

Nothing 'wrong'.

This (good) commit allowed LOCKDEP to throw a warning and eventually
panic the box.

A Fixes: tag does not imply the patch was wrong.

^ permalink raw reply

* Re: [PATCH 2/3] mm/zram: handle swap read/write via swap_ops
From: YoungJun Park @ 2026-06-15  6:39 UTC (permalink / raw)
  To: Jianyue Wu
  Cc: Andrew Morton, Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham,
	Barry Song, Kairui Song, Kemeng Shi, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-2-6c1a6639c222@gmail.com>

On Sun, Jun 14, 2026 at 11:35:30PM +0800, Jianyue Wu wrote:

Hello!

> +static void zram_swap_submit_read(struct swap_io_ctx *ctx)
> +{
> +	struct zram *zram = ctx->sis->bdev->bd_disk->private_data;

A passing thought: accessing `zram` here is too indirect. We might
need a `private_data` in the swap device struct someday?

(And only if there is real value in it, i.e. some swap-side-only private data is really needed.)

> +	struct swap_iocb *sio = ctx->sio;
> +	int nr = swap_iocb_nr_folios(sio);
> +	bool failed = false;
> +	int i, j;
> +			/*
> +			 * read_from_zspool() and mark_slot_accessed() must run
> +			 * under the same slot_lock.  zram_read_page() unlocks
> +			 * before returning, which leaves a window where
> +			 * writeback can pick an idle slot we just read.
> +			 */

Regarding the comment about the "window" where writeback can pick an
idle slot: I think this reasoning is a bit of a gray area. Writeback
could just as easily pick the slot right before entering this routine,
so the race condition seems fundamentally the same.

Isn't the actual justification here to separate the non-backend logic
and ensure mark_slot_accessed() is called under the lock, given that
zram_read_page() can call the backend device?

If the "window" mentioned in the comment is indeed a valid issue, then
zram_read_page() has the exact same problem and needs to be fixed as
well?

If not, IMHO I suggest revising or removing this comment to clarify
the true(?) intention. :)

> +			slot_lock(zram, idx);
> +			ret = read_from_zspool(zram, page, idx);
> +			if (!ret)
> +				mark_slot_accessed(zram, idx);
> +			slot_unlock(zram, idx);

^ permalink raw reply

* Re: [PATCH 1/3] mm/page_io: let block drivers register custom swap I/O ops
From: YoungJun Park @ 2026-06-15  1:50 UTC (permalink / raw)
  To: Jianyue Wu
  Cc: Andrew Morton, Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham,
	Barry Song, Kairui Song, Kemeng Shi, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-1-6c1a6639c222@gmail.com>

On Sun, Jun 14, 2026 at 11:35:29PM +0800, Jianyue Wu wrote:

...

Hello Jianyue.

Currently, the patch commit log indicates only a single custom swap
registration is supported. Shouldn't we allow multiple block drivers to
register their custom ops simultaneously from the beginning?

>  int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
>  		struct list_head *folio_list);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 284eebc40a70..ebdc96092961 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -2849,6 +2849,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
>  	sis->ops = &swap_bdev_ops;
>
>  	if (S_ISBLK(inode->i_mode)) {
> +		const struct swap_ops *block_ops = lookup_swap_block_ops(sis);

Also, just a personal thought on this part.

Instead of using `block_device_fops` as a lookup key, what if we handle
this similarly to how filesystems use the `a_ops->swap_activate` callback?

We could add a `swap_activate` callback directly into
struct block_device_operations (zram's zram_devops). This way, the
block device itself can set up and replace the swap `ops` directly without
needing a separate registration/lookup mechanism.

What are your thoughts on this approach?

Thanks,
Youngjun Park

^ permalink raw reply

* [PATCH blktests] scsi/009: fix unset bytes_to_write in TEST 8
From: Sebastian Chlad @ 2026-06-14 18:16 UTC (permalink / raw)
  To: shinichiro.kawasaki, linux-block; +Cc: Sebastian Chlad

bytes_to_write was never assigned before TEST 8, causing it to pass for
the wrong reason. Set it to atomic_unit_max_bytes + logical_block_size
and update the golden output with the expected "pwrite: Invalid argument"
from xfs_io.

Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
---

This is a followup on: https://github.com/linux-blktests/blktests/pull/245

 tests/scsi/009     | 1 +
 tests/scsi/009.out | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/scsi/009 b/tests/scsi/009
index 41a5152..c7a1754 100755
--- a/tests/scsi/009
+++ b/tests/scsi/009
@@ -143,6 +143,7 @@ test_device() {
 
 	test_desc="TEST 8 - perform a pwritev2 with size of sysfs_atomic_unit_max_bytes + 512 "
 	test_desc+="bytes with RWF_ATOMIC flag - pwritev2 should not be succesful"
+	bytes_to_write=$(( sysfs_atomic_unit_max_bytes + sysfs_logical_block_size ))
 	bytes_written=$(run_xfs_io_pwritev2_atomic "$TEST_DEV" "$bytes_to_write")
 	if [ "$bytes_written" = "" ]
 	then
diff --git a/tests/scsi/009.out b/tests/scsi/009.out
index e94882d..6c3780f 100644
--- a/tests/scsi/009.out
+++ b/tests/scsi/009.out
@@ -6,6 +6,7 @@ TEST 4 - check sysfs atomic_write_unit_min_bytes = scsi_debug atomic_wr_gran - p
 TEST 5 - check statx stx_atomic_write_unit_min - pass
 TEST 6 - check statx stx_atomic_write_unit_max - pass
 TEST 7 - perform a pwritev2 with size of sysfs_atomic_unit_max_bytes with RWF_ATOMIC flag - pwritev2 should be succesful - pass
+pwrite: Invalid argument
 TEST 8 - perform a pwritev2 with size of sysfs_atomic_unit_max_bytes + 512 bytes with RWF_ATOMIC flag - pwritev2 should not be succesful - pass
 TEST 9 - perform a pwritev2 with size of sysfs_atomic_unit_min_bytes with RWF_ATOMIC flag - pwritev2 should be succesful - pass
 pwrite: Invalid argument
-- 
2.51.0


^ permalink raw reply related

* Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-14 17:57 UTC (permalink / raw)
  To: linux-block, dm-devel

Hi,
  I've got a repeatable raid hang/warn and would appreciate some pointers
as to where to debug.
  (I've been logging stuff on  https://bugzilla.kernel.org/show_bug.cgi?id=221535 )

  This started off as debugging a case where I'd get my RAID1 (on the host)
getting a reliable 'rescheduling sector'/disk failure while running the qemu block test suite
during a qemu build, but then I tried to build a smaller discrete
test, and now I've got a simply triggerable warn and test hang.
There are no errors from the underlying SATA layer on the storage,
everything resyncs just fine.

I've got an existing LVM vg ('main') with two mirrors on sda2, and sdb2
which are SATA disks.

# lvcreate --type mirror --mirrors 1 -L 1G main /dev/sda2 /dev/sdb2
# mkfs.ext4 /dev/mapper/main-lvol0
# mount /dev/mapper/main-lvol0 /mnt/tmp/
# chmod a+rwx /mnt/tmp

$ dd if=/dev/zero of=/mnt/tmp/testfile bs=1024k count=1

(I then wait for the IO to stop)

then we've got this little test program:

<--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->
#include <errno.h>
#include <fcntl.h>             
#include <asm-generic/fcntl.h>
#include <stdio.h> 
#include <unistd.h>


const char* path="/mnt/tmp/testfile";
static char buf[8192];

int main()                                       
{
  int fd=open(path, O_RDWR|O_DIRECT|O_CLOEXEC);
    
  errno=0;
  int res3=pread(fd, buf, 4096, 0);
  printf("pread of 4096 said: %d (%m)\n", res3);

}
<--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><--><-->

Running that either hangs or gets a 'pread of 4096 said: -1 (Input/output error)';
when it hangs it's unkillable.

at the moment (on 7.1.0-rc7) this is giving:
Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369

(full backtrace below)
(Note there is a complaint in there about an sdb I/O error - repeated a lot - but
again, there are no SATA-level errors, the drive is fine according to SMART, and
I can read the whole of the underlying LVM mirrors, so I don't think the error is
physically there.)

I did a blktrace, although that gives me a 23G blkparse output, hmm
(I see each event repeated a lot - maybe per thread?)

252,26  15        1     0.000000000  3435  Q  RS 264192 + 8 [dbf]
  252,26 is /dev/mapper/main-lvol0
252,24  15        1     0.000005501  3435  A  RS 264192 + 8 <- (252,26) 264192
  252,24 is main-lvol0_mimage_0
252,24  15        2     0.000005761  3435  Q  RS 264192 + 8 [dbf]
  8,0   15        1     0.000008646  3435  A  RS 71634944 + 8 <- (252,24) 264192
    so that's sda 
  8,0   15        2     0.000008787  3435  A  RS 73734144 + 8 <- (8,2) 71634944
    I guess mapping down from sda2 to sda
  8,0   15        3     0.000009037  3435  Q  RS 73734144 + 8 [dbf]
  8,0   15        4     0.000009809  3435  C  RS 73734144 + 8 [65514]
      ??? Hmm what's the 65514 there?
252,24  15        3     0.000010320  3435  C  RS 264192 + 8 [65514]
252,25  15        1     0.000290384   369  Q   R 264192 + 8 [kworker/15:1]
   252,25 is main-lvol0_mimage_1

and at this point I'm a bit lost as to what I'm looking for.

Hints appreciated!

(I don't believe this is a regression - or at least not recent)

Dave




Jun 14 18:08:32 dalek kernel: device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
Jun 14 18:08:32 dalek dmeventd[1010]: Primary mirror device 252:24 read failed.
Jun 14 18:08:32 dalek kernel: WARNING: block/bio.c:1044 at bio_add_page+0x18b/0x250, CPU#15: kworker/15:1/369
Jun 14 18:08:32 dalek dmeventd[1010]: main-lvol0 is now in-sync.
Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Not tainted 7.1.0-rc7+ #786 PREEMPT(lazy) 
Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
Jun 14 18:08:32 dalek kernel: RIP: 0010:bio_add_page+0x18b/0x250
Jun 14 18:08:32 dalek kernel: Code: 24 10 4c 8b 04 24 84 c0 0f 85 c9 00 00 00 41 0f b7 40 78 48 8b 74 24 08 8b 4c 24 14 e9 b4 fe ff ff 0f 0b 31 c0 e9 55 d1 af 00 <0f> 0b eb f5 48 8b 7f 08 83 7f 60 05 0f 85 00 ff ff ff 49 8b 3b 4c
Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176fc10 EFLAGS: 00010246
Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffffd1fb8176fd18 RCX: 0000000000000000
Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8d1a8eb28b00
Jun 14 18:08:32 dalek kernel: RBP: 0000000000000000 R08: ffffd1fb8176fc38 R09: ffffd1fb8176fc40
Jun 14 18:08:32 dalek kernel: R10: ffffd1fb8176fc34 R11: 0000000000000000 R12: 0000000000000000
Jun 14 18:08:32 dalek kernel: R13: ffffd1fb8176fd90 R14: 0000000000000001 R15: ffff8d1a8eb28b00
Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
Jun 14 18:08:32 dalek kernel: Call Trace:
Jun 14 18:08:32 dalek kernel:  <TASK>
Jun 14 18:08:32 dalek kernel:  do_region+0x227/0x2a0
Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
Jun 14 18:08:32 dalek kernel:  </TASK>
Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
Jun 14 18:08:32 dalek kernel: ------------[ cut here ]------------
Jun 14 18:08:32 dalek kernel: WARNING: drivers/scsi/scsi_lib.c:1164 at scsi_alloc_sgtables+0x38a/0x400, CPU#15: kworker/15:1/369
Jun 14 18:08:32 dalek kernel: Modules linked in: nft_masq nft_reject_ipv4 act_csum cls_u32 sch_htb nf_nat_tftp nf_conntrack_tftp bridge stp llc rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reje>
Jun 14 18:08:32 dalek kernel:  drm_panel_backlight_quirks gpu_sched drm_suballoc_helper video nvme drm_display_helper nvme_core cec nvme_keyring sp5100_tco nvme_auth wmi serio_raw fuse scsi_dh_alua i2c_dev scsi_dh_rdac scsi_dh_emc
Jun 14 18:08:32 dalek kernel: CPU: 15 UID: 0 PID: 369 Comm: kworker/15:1 Tainted: G        W           7.1.0-rc7+ #786 PREEMPT(lazy) 
Jun 14 18:08:32 dalek kernel: Tainted: [W]=WARN
Jun 14 18:08:32 dalek kernel: Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Pro4, BIOS P3.10 07/13/2020
Jun 14 18:08:32 dalek kernel: Workqueue: kmirrord do_mirror
Jun 14 18:08:32 dalek kernel: RIP: 0010:scsi_alloc_sgtables+0x38a/0x400
Jun 14 18:08:32 dalek kernel: Code: 8b 3d ba 2d a9 01 e9 d1 fd ff ff 48 8b 75 00 48 8d bb f0 fe ff ff e8 15 b7 b0 ff 48 89 ab e0 00 00 00 89 45 08 e9 30 ff ff ff <0f> 0b 4c 8b 6c 24 30 b8 0a 00 00 00 e9 21 ff ff ff b8 09 00 00 00
Jun 14 18:08:32 dalek kernel: RSP: 0018:ffffd1fb8176f7f0 EFLAGS: 00010246
Jun 14 18:08:32 dalek kernel: RAX: 0000000000000000 RBX: ffff8d1aedad0110 RCX: 0000000000000009
Jun 14 18:08:32 dalek kernel: RDX: 0000000000000000 RSI: ffffffff99c15960 RDI: ffff8d1aedad0110
Jun 14 18:08:32 dalek kernel: RBP: ffff8d1a93d17000 R08: ffff8d1aedad0110 R09: ffff8d1a818fa800
Jun 14 18:08:32 dalek kernel: R10: 7020676e69736961 R11: 0000000000000000 R12: 0000000000000000
Jun 14 18:08:32 dalek kernel: R13: 0000000000000000 R14: ffff8d1a93394000 R15: ffff8d1a93d17000
Jun 14 18:08:32 dalek kernel: FS:  0000000000000000(0000) GS:ffff8d29d161f000(0000) knlGS:0000000000000000
Jun 14 18:08:32 dalek kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 14 18:08:32 dalek kernel: CR2: 00007f0ddcd7b9d0 CR3: 000000023dcbf000 CR4: 0000000000350ef0
Jun 14 18:08:32 dalek kernel: Call Trace:
Jun 14 18:08:32 dalek kernel:  <TASK>
Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
Jun 14 18:08:32 dalek kernel:  sd_setup_read_write_cmnd+0x9d/0x740
Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
Jun 14 18:08:32 dalek kernel:  scsi_queue_rq+0x4d2/0x890
Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_rq_list+0x241/0x530
Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
Jun 14 18:08:32 dalek kernel:  ? sbitmap_get+0x61/0x100
Jun 14 18:08:32 dalek kernel:  __blk_mq_do_dispatch_sched+0x330/0x340
Jun 14 18:08:32 dalek kernel:  __blk_mq_sched_dispatch_requests+0x143/0x180
Jun 14 18:08:32 dalek kernel:  blk_mq_sched_dispatch_requests+0x2d/0x70
Jun 14 18:08:32 dalek kernel:  blk_mq_run_hw_queue+0x2bf/0x350
Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
Jun 14 18:08:32 dalek kernel:  blk_mq_dispatch_list+0x172/0x350
Jun 14 18:08:32 dalek kernel:  blk_mq_flush_plug_list+0x51/0x1a0
Jun 14 18:08:32 dalek kernel:  ? blk_mq_submit_bio+0x71c/0x9f0
Jun 14 18:08:32 dalek kernel:  __blk_flush_plug+0x112/0x180
Jun 14 18:08:32 dalek kernel:  ? srso_return_thunk+0x5/0x5f
Jun 14 18:08:32 dalek kernel:  __submit_bio+0x19c/0x260
Jun 14 18:08:32 dalek kernel:  __submit_bio_noacct+0x8e/0x210
Jun 14 18:08:32 dalek kernel:  do_region+0x14c/0x2a0
Jun 14 18:08:32 dalek kernel:  dispatch_io+0xf1/0x150
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
Jun 14 18:08:32 dalek kernel:  dm_io+0x169/0x2d0
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_get_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ? __pfx_bio_next_page+0x10/0x10
Jun 14 18:08:32 dalek kernel:  do_reads+0x149/0x230
Jun 14 18:08:32 dalek kernel:  ? __pfx_read_callback+0x10/0x10
Jun 14 18:08:32 dalek kernel:  do_mirror+0x11a/0x2b0
Jun 14 18:08:32 dalek kernel:  process_one_work+0x19e/0x390
Jun 14 18:08:32 dalek kernel:  worker_thread+0x1a6/0x310
Jun 14 18:08:32 dalek kernel:  ? __pfx_worker_thread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  kthread+0xe4/0x120
Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ret_from_fork+0x1a1/0x270
Jun 14 18:08:32 dalek kernel:  ? __pfx_kthread+0x10/0x10
Jun 14 18:08:32 dalek kernel:  ret_from_fork_asm+0x1a/0x30
Jun 14 18:08:32 dalek kernel:  </TASK>
Jun 14 18:08:32 dalek kernel: ---[ end trace 0000000000000000 ]---
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:32 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2
Jun 14 18:08:37 dalek kernel: blk_print_req_error: 241000 callbacks suppressed
Jun 14 18:08:37 dalek kernel: I/O error, dev sdb, sector 50606087 op 0x0:(READ) flags 0x0 phys_seg 0 prio class 2


-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* [PATCH 3/3] mm/swap: route slot free notifications through swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Dispatch slot_free_notify through swap_ops instead of
block_device_operations. Zram keeps slot-free handling alongside its
other swap_ops methods.

Move slot_trylock into the CONFIG_SWAP block. With CONFIG_SWAP=n it
has no callers and the build fails on -Werror=unused-function.

Document the callback locking rules in include/linux/swap.h. Remove
the outdated locking.rst note for swap_slot_free_notify.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 Documentation/filesystems/locking.rst |  5 --
 drivers/block/zram/zram_drv.c         | 88 ++++++++++++++++++-----------------
 include/linux/blkdev.h                |  2 -
 include/linux/swap.h                  |  7 +++
 mm/swapfile.c                         | 13 ++----
 rust/kernel/block/mq/gen_disk.rs      |  1 -
 6 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 70481bdc031d..964c841bf917 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -443,7 +443,6 @@ prototypes::
 				unsigned long *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*getgeo)(struct gendisk *, struct hd_geometry *);
-	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 
 locking rules:
 
@@ -457,12 +456,8 @@ compat_ioctl:		no
 direct_access:		no
 unlock_native_capacity:	no
 getgeo:			no
-swap_slot_free_notify:	no	(see below)
 ======================= ===================
 
-swap_slot_free_notify is called with swap_lock and sometimes the page lock
-held.
-
 
 file_operations
 ===============
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9b2bd0287402..b78246dc1746 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -72,31 +72,6 @@ static void slot_lock_init(struct zram *zram, u32 index)
 			 &__key, 0);
 }
 
-/*
- * entry locking rules:
- *
- * 1) Lock is exclusive
- *
- * 2) lock() function can sleep waiting for the lock
- *
- * 3) Lock owner can sleep
- *
- * 4) Use TRY lock variant when in atomic context
- *    - must check return value and handle locking failers
- */
-static __must_check bool slot_trylock(struct zram *zram, u32 index)
-{
-	unsigned long *lock = &zram->table[index].__lock;
-
-	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
-		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
-		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
-		return true;
-	}
-
-	return false;
-}
-
 static void slot_lock(struct zram *zram, u32 index)
 {
 	unsigned long *lock = &zram->table[index].__lock;
@@ -2798,23 +2773,6 @@ static void zram_submit_bio(struct bio *bio)
 	}
 }
 
-static void zram_slot_free_notify(struct block_device *bdev,
-				unsigned long index)
-{
-	struct zram *zram;
-
-	zram = bdev->bd_disk->private_data;
-
-	atomic64_inc(&zram->stats.notify_free);
-	if (!slot_trylock(zram, index)) {
-		atomic64_inc(&zram->stats.miss_free);
-		return;
-	}
-
-	slot_free(zram, index);
-	slot_unlock(zram, index);
-}
-
 static void zram_comp_params_reset(struct zram *zram)
 {
 	u32 prio;
@@ -3058,6 +3016,50 @@ static void zram_swap_submit_write(struct swap_io_ctx *ctx)
 	swap_write_end(sio, failed);
 }
 
+/*
+ * entry locking rules:
+ *
+ * 1) Lock is exclusive
+ *
+ * 2) lock() function can sleep waiting for the lock
+ *
+ * 3) Lock owner can sleep
+ *
+ * 4) Use TRY lock variant when in atomic context
+ *    - must check return value and handle locking failures
+ */
+static __must_check bool slot_trylock(struct zram *zram, u32 index)
+{
+	unsigned long *lock = &zram->table[index].__lock;
+
+	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
+		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
+		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * swap_range_free() holds the swap cluster lock. Use slot_trylock() so
+ * we never block on a slot that is already locked elsewhere.
+ */
+static void zram_swap_slot_free_notify(struct swap_info_struct *sis,
+				       unsigned long index)
+{
+	struct zram *zram = sis->bdev->bd_disk->private_data;
+
+	atomic64_inc(&zram->stats.notify_free);
+	if (!slot_trylock(zram, index)) {
+		atomic64_inc(&zram->stats.miss_free);
+		return;
+	}
+
+	slot_free(zram, index);
+	slot_unlock(zram, index);
+}
+
 /*
  * No ->can_merge: block rules exist to grow bios on contiguous sectors and
  * matching blkcg.  zram already batches through swap_iocb, and
@@ -3068,6 +3070,7 @@ static void zram_swap_submit_write(struct swap_io_ctx *ctx)
 static const struct swap_ops zram_swap_ops = {
 	.submit_read		= zram_swap_submit_read,
 	.submit_write		= zram_swap_submit_write,
+	.slot_free_notify	= zram_swap_slot_free_notify,
 };
 
 #endif /* CONFIG_SWAP */
@@ -3075,7 +3078,6 @@ static const struct swap_ops zram_swap_ops = {
 static const struct block_device_operations zram_devops = {
 	.open = zram_open,
 	.submit_bio = zram_submit_bio,
-	.swap_slot_free_notify = zram_slot_free_notify,
 	.owner = THIS_MODULE
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..f861ceed39eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1669,8 +1669,6 @@ struct block_device_operations {
 	int (*getgeo)(struct gendisk *, struct hd_geometry *);
 	int (*set_read_only)(struct block_device *bdev, bool ro);
 	void (*free_disk)(struct gendisk *disk);
-	/* this callback is with swap_lock and sometimes page table lock held */
-	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	int (*report_zones)(struct gendisk *, sector_t sector,
 			    unsigned int nr_zones,
 			    struct blk_report_zones_args *args);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 70bf6f3f04dc..09640eb5a45d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -40,6 +40,11 @@ struct swap_io_ctx {
  *             the iocb is full or the plug is flushed.
  * @submit_write: flush the accumulated write ctx to the backend.
  * @submit_read: flush the accumulated read ctx to the backend.
+ * @slot_free_notify: optional callback invoked when a swap slot
+ *                    becomes free. swap_range_free() calls it with the
+ *                    swap cluster lock held. The folio lock may also be
+ *                    held on swap-cache teardown paths. Must not sleep
+ *                    or block.
  */
 struct swap_ops {
 	unsigned int		flags;
@@ -49,6 +54,8 @@ struct swap_ops {
 					     size_t prev_folio_size, int rw);
 	void			(*submit_write)(struct swap_io_ctx *ctx);
 	void			(*submit_read)(struct swap_io_ctx *ctx);
+	void			(*slot_free_notify)(struct swap_info_struct *sis,
+						    unsigned long offset);
 };
 
 int swap_register_block_ops(const struct block_device_operations *fops,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ebdc96092961..79a4166fb9bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1311,21 +1311,18 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 			    unsigned int nr_entries)
 {
 	unsigned long end = offset + nr_entries - 1;
-	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+	void (*slot_free_notify)(struct swap_info_struct *sis,
+				 unsigned long offset);
 	unsigned int i;
 
 	for (i = 0; i < nr_entries; i++)
 		zswap_invalidate(swp_entry(si->type, offset + i));
 
-	if (si->flags & SWP_BLKDEV)
-		swap_slot_free_notify =
-			si->bdev->bd_disk->fops->swap_slot_free_notify;
-	else
-		swap_slot_free_notify = NULL;
+	slot_free_notify = si->ops->slot_free_notify;
 	while (offset <= end) {
 		arch_swap_invalidate_page(si->type, offset);
-		if (swap_slot_free_notify)
-			swap_slot_free_notify(si->bdev, offset);
+		if (slot_free_notify)
+			slot_free_notify(si, offset);
 		offset++;
 	}
 
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
index 912cb805caf5..25552d69f711 100644
--- a/rust/kernel/block/mq/gen_disk.rs
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -135,7 +135,6 @@ pub fn build<T: Operations>(
             unlock_native_capacity: None,
             getgeo: None,
             set_read_only: None,
-            swap_slot_free_notify: None,
             report_zones: None,
             devnode: None,
             alternative_gpt_sector: None,

-- 
2.43.0


^ permalink raw reply related

* [PATCH 2/3] mm/zram: handle swap read/write via swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Register zram_swap_ops at module init.  The swap core still batches
folios into a swap_iocb; on flush, zram_swap_submit_write() maps each
folio page to its swap slot index and stores it via zram_write_page()
into the zspool, avoiding one bio per page.

For swap-in, zram_swap_submit_read() walks the same batch.  Without a
backing device, each slot is decompressed with read_from_zspool() while
slot_lock is held and mark_slot_accessed() runs in the same critical
section, so idle writeback cannot take the slot between read and mark.
When backing_dev is set, delegate the entire iocb to
swap_bdev_submit_read() because the batch may mix ZRAM_WB slots that
live on the backing block device.

Omit ->can_merge: zram batches through swap_iocb and compresses each
slot by index.  Block-sector merge rules do not apply.

Export swap_iocb_nr_folios(), swap_iocb_folio(), swap_read_end(),
swap_write_end(), and swap_bdev_submit_read() for the custom swap I/O
path.

Fail zram_init() if swap_register_block_ops() fails so the module
does not load without its swap path registered.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 drivers/block/zram/zram_drv.c | 127 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/swap.h          |   5 ++
 mm/page_io.c                  |  81 ++++++++++++++++++++++++++-
 3 files changed, 210 insertions(+), 3 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7917fc7a2a29..9b2bd0287402 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -34,6 +34,8 @@
 #include <linux/part_stat.h>
 #include <linux/kernel_read_file.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include "zram_drv.h"
 
@@ -55,6 +57,9 @@ static unsigned int num_devices = 1;
 static size_t huge_class_size;
 
 static const struct block_device_operations zram_devops;
+#if IS_ENABLED(CONFIG_SWAP)
+static bool zram_swap_ops_registered;
+#endif
 
 static void slot_free(struct zram *zram, u32 index);
 #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map)
@@ -2958,6 +2963,115 @@ static int zram_open(struct gendisk *disk, blk_mode_t mode)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_SWAP)
+static void zram_swap_submit_read(struct swap_io_ctx *ctx)
+{
+	struct zram *zram = ctx->sis->bdev->bd_disk->private_data;
+	struct swap_iocb *sio = ctx->sio;
+	int nr = swap_iocb_nr_folios(sio);
+	bool failed = false;
+	int i, j;
+
+	/*
+	 * With a backing device configured, the batch may include ZRAM_WB
+	 * slots.  Fall back to the block read path for the whole iocb
+	 * instead of checking each slot.
+	 */
+#ifdef CONFIG_ZRAM_WRITEBACK
+	if (zram->backing_dev) {
+		swap_bdev_submit_read(ctx);
+		return;
+	}
+#endif
+
+	for (i = 0; i < nr; i++) {
+		struct folio *folio = swap_iocb_folio(sio, i);
+		u32 base = swp_offset(folio->swap);
+
+		for (j = 0; j < folio_nr_pages(folio); j++) {
+			u32 idx = base + j;
+			struct page *page = folio_page(folio, j);
+			int ret;
+
+			/*
+			 * read_from_zspool() and mark_slot_accessed() must run
+			 * under the same slot_lock.  zram_read_page() unlocks
+			 * before returning, which leaves a window where
+			 * writeback can pick an idle slot we just read.
+			 */
+			slot_lock(zram, idx);
+			ret = read_from_zspool(zram, page, idx);
+			if (!ret)
+				mark_slot_accessed(zram, idx);
+			slot_unlock(zram, idx);
+			if (ret) {
+				failed = true;
+				atomic64_inc(&zram->stats.failed_reads);
+				pr_alert_ratelimited("Read-error on swap-device %s at index %u: err=%d\n",
+						     zram->disk->disk_name, idx, ret);
+				goto out;
+			}
+			flush_dcache_page(page);
+		}
+	}
+out:
+	swap_read_end(sio, failed);
+}
+
+static void zram_swap_submit_write(struct swap_io_ctx *ctx)
+{
+	struct zram *zram = ctx->sis->bdev->bd_disk->private_data;
+	struct swap_iocb *sio = ctx->sio;
+	int nr = swap_iocb_nr_folios(sio);
+	bool failed = false;
+	int i, j, ret = 0;
+	u32 idx = 0;
+
+	for (i = 0; i < nr; i++) {
+		struct folio *folio = swap_iocb_folio(sio, i);
+		u32 base = swp_offset(folio->swap);
+
+		for (j = 0; j < folio_nr_pages(folio); j++) {
+			idx = base + j;
+			ret = zram_write_page(zram, folio_page(folio, j), idx);
+			if (ret) {
+				/*
+				 * Leave partial zram data in place, same as the bio
+				 * write path.  swap_write_end() re-dirties every
+				 * page in the batch so they stay in swapcache with
+				 * their swap entries.  Freeing zram slots here would
+				 * leave entries pointing at empty indices until
+				 * slot_free_notify runs.
+				 */
+				failed = true;
+				atomic64_inc(&zram->stats.failed_writes);
+				pr_alert_ratelimited("Write-error on swap-device %s at index %u: err=%d\n",
+						     zram->disk->disk_name, idx, ret);
+				goto out;
+			}
+			slot_lock(zram, idx);
+			mark_slot_accessed(zram, idx);
+			slot_unlock(zram, idx);
+		}
+	}
+out:
+	swap_write_end(sio, failed);
+}
+
+/*
+ * No ->can_merge: block rules exist to grow bios on contiguous sectors and
+ * matching blkcg.  zram already batches through swap_iocb, and
+ * submit_write() compresses each slot by index, not by sector layout.
+ * Reusing swap_bdev_can_merge() would only split batches without helping
+ * zspool I/O.
+ */
+static const struct swap_ops zram_swap_ops = {
+	.submit_read		= zram_swap_submit_read,
+	.submit_write		= zram_swap_submit_write,
+};
+
+#endif /* CONFIG_SWAP */
+
 static const struct block_device_operations zram_devops = {
 	.open = zram_open,
 	.submit_bio = zram_submit_bio,
@@ -3233,6 +3347,10 @@ static int zram_remove_cb(int id, void *ptr, void *data)
 
 static void destroy_devices(void)
 {
+#if IS_ENABLED(CONFIG_SWAP)
+	if (zram_swap_ops_registered)
+		swap_unregister_block_ops(&zram_devops);
+#endif
 	class_unregister(&zram_control_class);
 	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
 	zram_debugfs_destroy();
@@ -3269,6 +3387,15 @@ static int __init zram_init(void)
 		return -EBUSY;
 	}
 
+#if IS_ENABLED(CONFIG_SWAP)
+	ret = swap_register_block_ops(&zram_devops, &zram_swap_ops);
+	if (ret) {
+		pr_err("zram: failed to register swap ops (%d)\n", ret);
+		goto out_error;
+	}
+	zram_swap_ops_registered = true;
+#endif
+
 	while (num_devices != 0) {
 		mutex_lock(&zram_index_mutex);
 		ret = zram_add();
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1d51df4179c1..70bf6f3f04dc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -54,6 +54,11 @@ struct swap_ops {
 int swap_register_block_ops(const struct block_device_operations *fops,
 			    const struct swap_ops *ops);
 void swap_unregister_block_ops(const struct block_device_operations *fops);
+int swap_iocb_nr_folios(struct swap_iocb *sio);
+struct folio *swap_iocb_folio(struct swap_iocb *sio, int idx);
+void swap_read_end(struct swap_iocb *sio, bool failed);
+void swap_write_end(struct swap_iocb *sio, bool failed);
+void swap_bdev_submit_read(struct swap_io_ctx *ctx);
 
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
diff --git a/mm/page_io.c b/mm/page_io.c
index 3ab620860379..7c17e44823d1 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -486,7 +486,21 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
 	delayacct_swapin_end();
 }
 
-static void swap_write_end(struct swap_iocb *sio, bool failed)
+/**
+ * swap_write_end - finish a swap write iocb
+ * @sio:    swap_iocb whose pages were just written
+ * @failed: true if any of the underlying writes failed
+ *
+ * Ends writeback on every page captured by @sio. On failure each page
+ * is also re-dirtied and PG_reclaim is cleared, mirroring the bio
+ * write completion path. @sio is returned to the swap iocb mempool.
+ *
+ * swap_ops providers must call this exactly once per submit_write()
+ * ctx (typically at the end of their submit_write callback).
+ *
+ * Context: any context the submit_write() callback runs in.
+ */
+void swap_write_end(struct swap_iocb *sio, bool failed)
 {
 	int p;
 
@@ -501,6 +515,7 @@ static void swap_write_end(struct swap_iocb *sio, bool failed)
 	}
 	mempool_free(sio, sio_pool);
 }
+EXPORT_SYMBOL_GPL(swap_write_end);
 
 static void swap_fs_write_complete(struct kiocb *iocb, long ret)
 {
@@ -536,7 +551,26 @@ static void end_swap_bio_write(struct bio *bio)
 	swap_write_end(sio, failed);
 }
 
-static void swap_read_end(struct swap_iocb *sio, bool failed)
+/**
+ * swap_read_end - finish a swap read iocb
+ * @sio:    swap_iocb whose folios were just read in
+ * @failed: true if any of the underlying reads failed
+ *
+ * Unlocks every folio captured by @sio. On success each folio is also
+ * marked uptodate and swap-in counters (PSWPIN, mTHP, memcg) are bumped
+ * by folio_nr_pages(). On failure folios are left not-uptodate so the
+ * caller observes the failure and retries or surfaces an error. @sio is
+ * returned to the swap iocb mempool.
+ *
+ * swap_ops providers must call this exactly once per submit_read() ctx
+ * (typically at the end of their submit_read callback). If the provider
+ * defers to swap_bdev_ops.submit_read() for fallback, the bdev path
+ * will call swap_read_end() itself and the provider must not call it
+ * again for the same ctx.
+ *
+ * Context: any context the submit_read() callback runs in.
+ */
+void swap_read_end(struct swap_iocb *sio, bool failed)
 {
 	int p;
 
@@ -557,6 +591,34 @@ static void swap_read_end(struct swap_iocb *sio, bool failed)
 
 	mempool_free(sio, sio_pool);
 }
+EXPORT_SYMBOL_GPL(swap_read_end);
+
+/**
+ * swap_iocb_nr_folios - number of folios in a swap I/O batch
+ * @sio: swap_iocb passed to a swap_ops submit callback.
+ *
+ * Returns how many folios the swap core has batched into @sio. Used
+ * together with swap_iocb_folio() so swap_ops providers can walk the
+ * batch without depending on the swap core's internal iocb layout.
+ */
+int swap_iocb_nr_folios(struct swap_iocb *sio)
+{
+	return sio->nr_bvecs;
+}
+EXPORT_SYMBOL_GPL(swap_iocb_nr_folios);
+
+/**
+ * swap_iocb_folio - folio at slot @idx in a swap I/O batch
+ * @sio: swap_iocb passed to a swap_ops submit callback.
+ * @idx: index in the range [0, swap_iocb_nr_folios(@sio)).
+ *
+ * Returns the folio at the given batch slot.
+ */
+struct folio *swap_iocb_folio(struct swap_iocb *sio, int idx)
+{
+	return page_folio(sio->bvecs[idx].bv_page);
+}
+EXPORT_SYMBOL_GPL(swap_iocb_folio);
 
 static void swap_fs_read_complete(struct kiocb *iocb, long ret)
 {
@@ -613,7 +675,19 @@ static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
 	}
 }
 
-static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
+/**
+ * swap_bdev_submit_read - fall back to the default block-device read path
+ * @ctx: in-progress submit_read context.
+ *
+ * Builds a bio for the accumulated ctx and submits it through the
+ * normal block layer. swap_ops providers can call this when they
+ * cannot serve a particular ctx themselves (for example zram folios
+ * stored on a backing device). The bio completion path takes care of
+ * calling swap_read_end() on @ctx. The caller must not call it again.
+ *
+ * Context: any context the submit_read() callback runs in.
+ */
+void swap_bdev_submit_read(struct swap_io_ctx *ctx)
 {
 	struct swap_iocb *sio = ctx->sio;
 	struct bio *bio = &sio->bio;
@@ -638,6 +712,7 @@ static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
 		submit_bio(bio);
 	}
 }
+EXPORT_SYMBOL_GPL(swap_bdev_submit_read);
 
 static bool swap_bdev_can_merge(struct folio *folio, struct folio *prev_folio,
 		size_t prev_folio_size, int rw)

-- 
2.43.0


^ permalink raw reply related

* [PATCH 1/3] mm/page_io: let block drivers register custom swap I/O ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

Add swap_register_block_ops() so a block driver can install custom
swap read/write handlers instead of always building bios.

When swapon targets a block device (S_ISBLK), setup_swap_extents()
checks whether that driver's block_device_operations were registered.
If yes, sis->ops points at the driver table. Otherwise sis->ops
stays on swap_bdev_ops.

Swap files are unchanged. They still use the filesystem path and
extent tree, because their page index is not a raw disk sector.

Register swap_ops in a single global slot keyed by the driver's
block_device_operations. lookup_swap_block_ops() matches the fops of
sis->bdev at swapon. Registration returns -EBUSY if the slot is
already taken. A single slot is enough while only zram needs custom
swap I/O. Supporting several block drivers would need a per-fops
lookup table instead.

swap_unregister_block_ops() must pass the same fops that
registered. Swap areas created before unregister keep the old ops
until swapoff. The driver module must remain loaded while they are
in use.

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
 include/linux/swap.h |  35 +++++++++++++++++
 mm/page_io.c         | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap.h            |  18 +--------
 mm/swapfile.c        |   4 ++
 4 files changed, 147 insertions(+), 16 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 636d94108166..1d51df4179c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -19,6 +19,41 @@
 struct notifier_block;
 
 struct bio;
+struct block_device_operations;
+struct folio;
+struct swap_iocb;
+struct swap_info_struct;
+
+struct swap_io_ctx {
+	struct swap_iocb	*sio;
+	struct swap_info_struct	*sis;
+};
+
+/* Set when the swap backend requires GFP_NOFS allocations. */
+#define SWAP_OPS_F_NOFS		(1U << 0)
+
+/**
+ * struct swap_ops - per-swap-area I/O batching callbacks
+ * @can_merge: optional. Return true iff @folio can be appended to a ctx
+ *             that already holds @prev_folio of @prev_folio_size bytes.
+ *             When NULL, folios on the same swap area are batched until
+ *             the iocb is full or the plug is flushed.
+ * @submit_write: flush the accumulated write ctx to the backend.
+ * @submit_read: flush the accumulated read ctx to the backend.
+ */
+struct swap_ops {
+	unsigned int		flags;
+
+	bool			(*can_merge)(struct folio *folio,
+					     struct folio *prev_folio,
+					     size_t prev_folio_size, int rw);
+	void			(*submit_write)(struct swap_io_ctx *ctx);
+	void			(*submit_read)(struct swap_io_ctx *ctx);
+};
+
+int swap_register_block_ops(const struct block_device_operations *fops,
+			    const struct swap_ops *ops);
+void swap_unregister_block_ops(const struct block_device_operations *fops);
 
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
diff --git a/mm/page_io.c b/mm/page_io.c
index c020e8ebf966..3ab620860379 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -24,6 +24,8 @@
 #include <linux/uio.h>
 #include <linux/sched/task.h>
 #include <linux/delayacct.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
 #include <linux/zswap.h>
 #include "swap.h"
 #include "swap_table.h"
@@ -325,6 +327,8 @@ static bool swap_can_merge(struct swap_io_ctx *ctx, struct folio *folio,
 
 	if (ctx->sis != sis)
 		return false;
+	if (!sis->ops->can_merge)
+		return true;
 	return sis->ops->can_merge(folio, prev_folio, prev_folio_size, rw);
 }
 
@@ -577,6 +581,18 @@ static void swap_bio_read_end_io(struct bio *bio)
 	swap_read_end(sio, failed);
 }
 
+/**
+ * swap_bdev_submit_write - default block-device write path for swap
+ * @ctx: in-progress submit_write context.
+ *
+ * Builds a bio for the accumulated ctx and submits it through the normal
+ * block layer. This is the submit_write implementation used by swap_bdev_ops
+ * for ordinary block swap areas. swap_ops providers that override submit_write
+ * (e.g. zram) but still fall back to the block layer for some I/Os should use
+ * their own bio construction; this function is not exported.
+ *
+ * Context: process context (may sleep if SWP_SYNCHRONOUS_IO is set).
+ */
 static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
 {
 	struct swap_iocb *sio = ctx->sio;
@@ -640,6 +656,96 @@ const struct swap_ops swap_bdev_ops = {
 	.can_merge		= swap_bdev_can_merge,
 };
 
+static DEFINE_MUTEX(swap_block_ops_lock);
+static const struct block_device_operations *swap_block_fops;
+static const struct swap_ops *swap_block_ops;
+
+/**
+ * swap_register_block_ops - install swap callbacks for a block driver
+ * @fops: block_device_operations identifying the driver. Used as a
+ *        match key in setup_swap_extents(): a S_ISBLK swap area is
+ *        routed to @ops when its bdev's gendisk fops equals @fops.
+ * @ops:  swap_ops vtable selected for matching swap areas. Must populate
+ *        ->submit_read and ->submit_write. ->can_merge is optional.
+ *
+ * Lets a block driver (zram and similar) replace the default
+ * swap_bdev_ops with its own submit_read / submit_write implementation.
+ *
+ * Returns 0 on success, -EINVAL when @fops or @ops are bad (a required
+ * callback is missing), or -EBUSY when the single registration slot is
+ * already taken. That slot is enough while only zram needs custom swap I/O.
+ * Several block drivers would need a per-fops lookup table instead.
+ *
+ * Context: process context, may sleep.
+ */
+int swap_register_block_ops(const struct block_device_operations *fops,
+			    const struct swap_ops *ops)
+{
+	int ret;
+
+	if (WARN_ON_ONCE(!fops || !ops || !ops->submit_read ||
+			 !ops->submit_write))
+		return -EINVAL;
+
+	mutex_lock(&swap_block_ops_lock);
+	if (swap_block_fops || swap_block_ops) {
+		ret = -EBUSY;
+		goto out;
+	}
+	swap_block_fops = fops;
+	swap_block_ops = ops;
+	ret = 0;
+out:
+	mutex_unlock(&swap_block_ops_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(swap_register_block_ops);
+
+/**
+ * swap_unregister_block_ops - undo swap_register_block_ops()
+ * @fops: same block_device_operations passed to swap_register_block_ops().
+ *
+ * Clears the registered fops/ops slot so future swapon calls fall back
+ * to swap_bdev_ops. The @fops match acts as a soft owner check so a
+ * driver cannot accidentally tear down another driver's registration.
+ * A mismatch is treated as a bug and triggers WARN_ON_ONCE. Swap areas
+ * that already captured the registered ops keep their sis->ops pointer.
+ * The caller must ensure the module owning the ops outlives any such
+ * swap area. For block drivers this is guaranteed by the bdev open
+ * reference held across swapon.
+ * Calling unregister before a successful register is a no-op.
+ *
+ * Context: process context, may sleep.
+ */
+void swap_unregister_block_ops(const struct block_device_operations *fops)
+{
+	mutex_lock(&swap_block_ops_lock);
+	/* never registered or already unregistered. */
+	if (!swap_block_fops)
+		goto out;
+	if (WARN_ON_ONCE(swap_block_fops != fops))
+		goto out;
+	swap_block_fops = NULL;
+	swap_block_ops = NULL;
+out:
+	mutex_unlock(&swap_block_ops_lock);
+}
+EXPORT_SYMBOL_GPL(swap_unregister_block_ops);
+
+const struct swap_ops *lookup_swap_block_ops(struct swap_info_struct *sis)
+{
+	const struct swap_ops *ops = NULL;
+
+	if (!sis->bdev)
+		return NULL;
+
+	mutex_lock(&swap_block_ops_lock);
+	if (swap_block_fops && sis->bdev->bd_disk->fops == swap_block_fops)
+		ops = swap_block_ops;
+	mutex_unlock(&swap_block_ops_lock);
+	return ops;
+}
+
 static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
 {
 	struct swap_iocb *sio = ctx->sio;
diff --git a/mm/swap.h b/mm/swap.h
index edb512e619ee..4bdd38f7a5e8 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h> /* for atomic_long_t */
 #include <linux/mm.h> /* for PAGE_SHIFT */
+#include <linux/swap.h>
 
 struct mempolicy;
 struct swap_iocb;
@@ -79,22 +80,6 @@ enum swap_cluster_flags {
 	CLUSTER_FLAG_MAX,
 };
 
-struct swap_io_ctx {
-	struct swap_iocb	*sio;
-	struct swap_info_struct	*sis;
-};
-
-#define SWAP_OPS_F_NOFS		(1U << 0)
-
-struct swap_ops {
-	unsigned int		flags;
-
-	bool (*can_merge)(struct folio *folio, struct folio *prev_folio,
-			size_t prev_folio_size, int rw);
-	void (*submit_write)(struct swap_io_ctx *ctx);
-	void (*submit_read)(struct swap_io_ctx *ctx);
-};
-
 #ifdef CONFIG_SWAP
 #include <linux/swapops.h> /* for swp_offset */
 #include <linux/blk_types.h> /* for bio_end_io_t */
@@ -472,6 +457,7 @@ static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
 #endif /* CONFIG_SWAP */
 
 extern const struct swap_ops swap_bdev_ops;
+const struct swap_ops *lookup_swap_block_ops(struct swap_info_struct *sis);
 
 int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
 		struct list_head *folio_list);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 284eebc40a70..ebdc96092961 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2849,6 +2849,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
 	sis->ops = &swap_bdev_ops;
 
 	if (S_ISBLK(inode->i_mode)) {
+		const struct swap_ops *block_ops = lookup_swap_block_ops(sis);
+
+		if (block_ops)
+			sis->ops = block_ops;
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
 		return ret;

-- 
2.43.0


^ permalink raw reply related

* [PATCH 0/3] mm/zram: route block swap I/O through swap_ops
From: Jianyue Wu @ 2026-06-14 15:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham, Barry Song,
	Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc, Jianyue Wu

This series builds on Christoph Hellwig's swap batching rework that
moves block swap onto struct swap_iocb and per-backend struct
swap_ops handlers [1].  Christoph's patches unify batching for
ordinary block devices and swap files.  zram still needs a custom
path because swap slots map to compressed pages, not disk sectors.

The first patch adds swap_register_block_ops() so a block driver can
install custom submit_read/submit_write handlers when swapon targets
its block device.  The default swap_bdev_ops path is unchanged for
devices that do not register.

The second patch registers zram_swap_ops at module init.  On write,
the swap core still batches folios into a swap_iocb.  zram maps each
folio to a slot index and stores it through zram_write_page() instead
of building one bio per page.  Read handling keeps slot_lock and
mark_slot_accessed() in one critical section.  Writeback-enabled zram
falls back to swap_bdev_submit_read() for ZRAM_WB slots.

The third patch moves slot_free_notify into swap_ops next to the
other zram swap callbacks, and documents the locking contract for
that hook.

Applied on top of Christoph Hellwig's "better block swap batching and
a different take on swap_ops" series [1].

[1] https://lore.kernel.org/linux-mm/?q=better+block+swap+batching

To: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Youngjun Park <youngjun.park@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-block@vger.kernel.org
Cc: linux-doc@vger.kernel.org

Signed-off-by: Jianyue Wu <wujianyue000@gmail.com>
---
Jianyue Wu (3):
      mm/page_io: let block drivers register custom swap I/O ops
      mm/zram: handle swap read/write via swap_ops
      mm/swap: route slot free notifications through swap_ops

 Documentation/filesystems/locking.rst |   5 -
 drivers/block/zram/zram_drv.c         | 215 +++++++++++++++++++++++++++-------
 include/linux/blkdev.h                |   2 -
 include/linux/swap.h                  |  47 ++++++++
 mm/page_io.c                          | 187 ++++++++++++++++++++++++++++-
 mm/swap.h                             |  18 +--
 mm/swapfile.c                         |  17 +--
 rust/kernel/block/mq/gen_disk.rs      |   1 -
 8 files changed, 414 insertions(+), 78 deletions(-)
---
base-commit: 842f51deada6449843f811bfa22e536a01ae5a0c
change-id: 20260614-zram-swap-ops-block-register-a1b2c3d4e5f6

Best regards,
-- 
Jianyue Wu <wujianyue000@gmail.com>

^ permalink raw reply

* Re: [PATCH blktests v2] throtl/008: Add a test for the iocost cgroup controller
From: Shin'ichiro Kawasaki @ 2026-06-14  6:30 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Damien Le Moal, linux-block
In-Reply-To: <20260604175423.3809638-1-bvanassche@acm.org>

On Jun 04, 2026 / 10:54, Bart Van Assche wrote:
> Add a test for read and write IOPS throttling.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>

Thanks for this v2 patch. I applied it with two minor changes below.

> diff --git a/tests/throtl/008 b/tests/throtl/008
> new file mode 100755
> index 000000000000..f4d3b080797a
> --- /dev/null
> +++ b/tests/throtl/008
> @@ -0,0 +1,104 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-3.0+
> +# Copyright (C) 2026 Google LLC
> +#
> +# Test cgroup iocost IOPS limiting.
> +
> +. tests/throtl/rc
> +. common/fio
> +
> +DESCRIPTION="test cgroup iocost controller limits"
> +
> +requires() {
> +	_have_fio
> +	_have_program bc

This check above is in group_requires(), so I dropped it.

> +	_have_kernel_option BLK_CGROUP_IOCOST
> +}

I added set_conditions() here, so that this test can be run for both
null_blk and scsi_debug. It also makes this test case consistent with
other test cases in this group.

> +
> +run_test() {
> +	# dev_t is global to make it available in the caller.
> +	dev_t=$(<"/sys/block/${THROTL_DEV}/dev")

...

^ permalink raw reply

* [PATCH] blk-iocost: correct CONFIG_TRACEPOINTS macro name in comments
From: Ethan Nelson-Moore @ 2026-06-13 22:54 UTC (permalink / raw)
  To: cgroups, linux-block
  Cc: Ethan Nelson-Moore, Tejun Heo, Josef Bacik, Jens Axboe

Comments in block/blk-iocost.c incorrectly refer to
CONFIG_TRACE_POINTS instead of CONFIG_TRACEPOINTS. Correct them.

Discovered while searching for CONFIG_* symbols referenced in code but
not defined in any Kconfig file.

Signed-off-by: Ethan Nelson-Moore <enelsonmoore@gmail.com>
---
 block/blk-iocost.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 0cca88a366dc..04630c36b737 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -205,9 +205,9 @@ static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 		}								\
 	} while (0)
 
-#else	/* CONFIG_TRACE_POINTS */
+#else	/* CONFIG_TRACEPOINTS */
 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
-#endif	/* CONFIG_TRACE_POINTS */
+#endif	/* CONFIG_TRACEPOINTS */
 
 enum {
 	MILLION			= 1000000,
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH Resend] MAINTAINERS: Update Coly Li's email address
From: Coly Li @ 2026-06-13 15:30 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-bcache, linux-block, linux-kernel
In-Reply-To: <178136456783.1954753.8358889895275169463.b4-ty@b4>

> 2026年6月13日 23:29，Jens Axboe <axboe@kernel.dk> 写道：
> 
> 
> On Sat, 13 Jun 2026 23:04:58 +0800, colyli@fygo.io wrote:
>> I switch to colyli@fygo.io as my current email address.
> 
> Applied, thanks!
> 
> [1/1] MAINTAINERS: Update Coly Li's email address
>      commit: c7c76f9232bd34835d821f14abdc5fafc17bc938

Thanks a lot!

Coly Li

^ permalink raw reply

* Re: [PATCH Resend] MAINTAINERS: Update Coly Li's email address
From: Jens Axboe @ 2026-06-13 15:29 UTC (permalink / raw)
  To: colyli; +Cc: linux-bcache, linux-block, linux-kernel
In-Reply-To: <20260613150458.682707-1-colyli@fygo.io>


On Sat, 13 Jun 2026 23:04:58 +0800, colyli@fygo.io wrote:
> I switch to colyli@fygo.io as my current email address.

Applied, thanks!

[1/1] MAINTAINERS: Update Coly Li's email address
      commit: c7c76f9232bd34835d821f14abdc5fafc17bc938

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH Resend] MAINTAINERS: Update Coly Li's email address
From: Coly Li @ 2026-06-13 15:11 UTC (permalink / raw)
  To: axboe; +Cc: linux-bcache, linux-block, linux-kernel
In-Reply-To: <20260613150458.682707-1-colyli@fygo.io>

Hi Jens,

I assume this patch went into the spam box again (due to the DKIM issue), so I am using my personal email address to gently notify you.
Could you please take this patch to update my current employer email address?

Thanks in advance.

Coly Li

> 2026年6月13日 23:04，colyli@fygo.io 写道：
> 
> From: Coly Li <colyli@fygo.io>
> 
> I switch to colyli@fygo.io as my current email address.
> 
> Signed-off-by: Coly Li <colyli@fygo.io>
> ---
> MAINTAINERS | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 8856f10a72bd..d2aaa7dacf90 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4446,11 +4446,11 @@ F: Documentation/networking/batman-adv.rst
> F: include/uapi/linux/batadv_packet.h
> F: include/uapi/linux/batman_adv.h
> F: net/batman-adv/
> 
> BCACHE (BLOCK LAYER CACHE)
> -M: Coly Li <colyli@fnnas.com>
> +M: Coly Li <colyli@fygo.io>
> M: Kent Overstreet <kent.overstreet@linux.dev>
> L: linux-bcache@vger.kernel.org
> S: Maintained
> W: http://bcache.evilpiepirate.org
> C: irc://irc.oftc.net/bcache
> -- 
> 2.47.3
> 
>

^ permalink raw reply

* [PATCH Resend] MAINTAINERS: Update Coly Li's email address
From: colyli @ 2026-06-13 15:04 UTC (permalink / raw)
  To: axboe; +Cc: linux-bcache, linux-block, linux-kernel, Coly Li

From: Coly Li <colyli@fygo.io>

I switch to colyli@fygo.io as my current email address.

Signed-off-by: Coly Li <colyli@fygo.io>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8856f10a72bd..d2aaa7dacf90 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4446,11 +4446,11 @@ F:	Documentation/networking/batman-adv.rst
 F:	include/uapi/linux/batadv_packet.h
 F:	include/uapi/linux/batman_adv.h
 F:	net/batman-adv/
 
 BCACHE (BLOCK LAYER CACHE)
-M:	Coly Li <colyli@fnnas.com>
+M:	Coly Li <colyli@fygo.io>
 M:	Kent Overstreet <kent.overstreet@linux.dev>
 L:	linux-bcache@vger.kernel.org
 S:	Maintained
 W:	http://bcache.evilpiepirate.org
 C:	irc://irc.oftc.net/bcache
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH] block: check bio split for unaligned bvec
From: Jens Axboe @ 2026-06-13 12:36 UTC (permalink / raw)
  To: linux-block, Keith Busch; +Cc: hch, Keith Busch, Carlos Maiolino
In-Reply-To: <20260612223205.465913-1-kbusch@meta.com>


On Fri, 12 Jun 2026 15:32:04 -0700, Keith Busch wrote:
> Offsets and lengths need to be validated against the dma alignment. This
> check was skipped for a sufficiently small bio with a single bvec, which
> may allow an invalid request dispatched to the driver. Force the
> validation for an unaligned bvec by forcing the bio split path that
> handles this condition.
> 
> 
> [...]

Applied, thanks!

[1/1] block: check bio split for unaligned bvec
      commit: 9b0c3673c88588d613d8f09f5931b2b466c6a83d

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH] nbd: Reclassify sockets to avoid lockdep circular dependency
From: Jens Axboe @ 2026-06-13 12:34 UTC (permalink / raw)
  To: Josef Bacik, Eric Dumazet
  Cc: linux-kernel, linux-block, nbd, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Kuniyuki Iwashima, netdev,
	syzbot+607cdcf978b3e79da878
In-Reply-To: <20260613042619.1108126-1-edumazet@google.com>


On Sat, 13 Jun 2026 04:26:19 +0000, Eric Dumazet wrote:
> syzbot reported a possible circular locking dependency in udp_sendmsg()
> where fs_reclaim can be triggered while holding sk_lock, and fs_reclaim
> can eventually depend on another sk_lock (e.g., if NBD is used for swap
> or writeback and NBD uses TLS/TCP which acquires sk_lock).
> 
> Since the UDP socket and the NBD TCP/TLS socket are different, this is a
> false positive. Fix this by reclassifying NBD sockets to a separate lock
> class when they are added to the NBD device.
> 
> [...]

Applied, thanks!

[1/1] nbd: Reclassify sockets to avoid lockdep circular dependency
      commit: d532cddb6c6049ced414d64d83c6ce7149a6421a

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH v4] loop: Fix NULL pointer dereference in lo_rw_aio()
From: Tetsuo Handa @ 2026-06-13 11:00 UTC (permalink / raw)
  To: Al Viro
  Cc: Jens Axboe, Bart Van Assche, Christoph Hellwig, Damien Le Moal,
	Ming Lei, linux-block, LKML, Andrew Morton, Linus Torvalds,
	linux-btrfs, David Sterba, linux-fsdevel, Christian Brauner,
	Hillf Danton
In-Reply-To: <20260609175013.GH2636677@ZenIV>

On 2026/06/10 2:50, Al Viro wrote:
> Still breaks xfs/259, same as the version in next-20260605...

I installed xfstests-dev and reproduced a "umount: /home/test: target is busy." problem which Al Viro is
experiencing with https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?h=next-20260608&id=fb1d5846e99c8aa4ce8da7e6ee7643b01da25b8c .

--------------------------------------------------------------------------------
./check xfs/259
FSTYP         -- xfs (non-debug)
PLATFORM      -- Linux/x86_64 kvm1 7.1.0-rc7-next-20260608+ #72 SMP PREEMPT_DYNAMIC Tue Jun  9 18:27:54 EDT 2026
MKFS_OPTIONS  -- -f /dev/sdb2
MOUNT_OPTIONS -- /dev/sdb2 /home/scratch

xfs/259 3s ... umount: /home/test: target is busy.
_check_xfs_filesystem: filesystem on /dev/sdb1 has dirty log
(see /home/al/xfstests/results//xfs/259.full for details)
_check_xfs_filesystem: filesystem on /dev/sdb1 is inconsistent (r)
(see /home/al/xfstests/results//xfs/259.full for details)

Ran: xfs/259
Failures: xfs/259
Failed 1 of 1 tests
--------------------------------------------------------------------------------



Below I describe what I tried, but my opinion is that this "target is busy" problem
should be addressed on the xfstests side (i.e. retry umount after a short sleep when
umount fails with a "target is busy" error).

--------------------------------------------------------------------------------
# fdisk -l
Disk /dev/nvme0n1: 40 GiB, 42949672960 bytes, 83886080 sectors
Disk model: VMware Virtual NVMe Disk
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disklabel type: gpt
Disk identifier: 068F8BED-D8BE-4795-8ABD-47A089653C58

Device         Start      End  Sectors Size Type
/dev/nvme0n1p1  2048     4095     2048   1M BIOS boot
/dev/nvme0n1p2  4096 83884031 83879936  40G Linux filesystem


Disk /dev/sda: 20 GiB, 21474836480 bytes, 41943040 sectors
Disk model: VMware Virtual S
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disklabel type: dos
Disk identifier: 0x1c8e18a6

Device     Boot    Start      End  Sectors Size Id Type
/dev/sda1           2048 20970495 20968448  10G 83 Linux
/dev/sda2       20971520 41943039 20971520  10G 83 Linux
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
cd
git clone https://kernel.googlesource.com/pub/scm/linux/kernel/git/cem/xfstests-dev
cd xfstests-dev/
make -j10
make install
cd /var/lib/xfstests/
TEST_SZ=$(blockdev --getsz /dev/sda1)
SCRATCH_SZ=$(blockdev --getsz /dev/sda2)
dmsetup create delayed_test --table "0 $TEST_SZ delay /dev/sda1 0 10"
dmsetup create delayed_scratch --table "0 $SCRATCH_SZ delay /dev/sda2 0 10"
export FSTYP=xfs
export TEST_DEV=/dev/mapper/delayed_test
export TEST_DIR=/home/test
export SCRATCH_DEV=/dev/mapper/delayed_scratch
export SCRATCH_MNT=/home/scratch
mkdir -p $TEST_DIR $SCRATCH_MNT
cd /var/lib/xfstests/
./check xfs/259
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
FSTYP         -- xfs (non-debug)
PLATFORM      -- Linux/x86_64 localhost 7.1.0-rc7-next-20260608 #22 SMP PREEMPT_DYNAMIC Sat Jun 13 18:50:49 JST 2026
MKFS_OPTIONS  -- -f /dev/mapper/delayed_scratch
MOUNT_OPTIONS -- /dev/mapper/delayed_scratch /home/scratch

xfs/259 61s ... umount: /home/test: target is busy.
_check_xfs_filesystem: filesystem on /dev/mapper/delayed_test has dirty log
(see /var/lib/xfstests/results//xfs/259.full for details)
_check_xfs_filesystem: filesystem on /dev/mapper/delayed_test is inconsistent (r)
(see /var/lib/xfstests/results//xfs/259.full for details)
Trying to repair broken TEST_DEV file system
_repair_test_fs: failed, err=1
(see /var/lib/xfstests/results//xfs/259.full for details)

Ran: xfs/259
Failures: xfs/259
Failed 1 of 1 tests

--------------------------------------------------------------------------------

I initially suspected that the cause of "target is busy" error is that fput() from
__loop_clr_fd() does not wait for completion before "losetup -d" completes. But a
debug printk() patch shown below indicated a tendency:

  (a) __loop_clr_fd() is called by "udev-worker" rather than "losetup" when this problem happens

  (b) propagate_mount_busy()!=0 when do_umount() fails with -EBUSY

--------------------------------------------------------------------------------
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c3b607a3ddc4..7408f314a1fa 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1763,6 +1763,8 @@ static void lo_release(struct gendisk *disk)
 	mutex_unlock(&lo->lo_mutex);
 
 	if (need_clear) {
+		printk("Flush: task=%s[%d] dev=loop%d state=%d\n",
+		       current->comm, current->pid, lo->lo_number, lo->lo_state);
 		/*
 		 * Temporarily release disk->open_mutex in order to flush pending I/O
 		 * requests before clearing the backing device.
@@ -1813,6 +1815,8 @@ static void lo_release(struct gendisk *disk)
 		mutex_lock(&lo->lo_disk->open_mutex);
 		if (WARN_ON(data_race(READ_ONCE(lo->lo_state)) != Lo_rundown))
 			return;
+		printk("Teardown: task=%s[%d] dev=loop%d state=%d\n",
+		       current->comm, current->pid, lo->lo_number, lo->lo_state);
 		__loop_clr_fd(lo);
 	}
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 09ab7fc72f86..9710460fb449 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1893,6 +1893,8 @@ static int do_umount(struct mount *mnt, int flags)
 		 */
 		lock_mount_hash();
 		if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
+			printk("%s: task=%s[%d] !list_empty(&mnt->mnt_mounts)=%d mnt_get_count(mnt)=%d\n", __func__,
+			       current->comm, current->pid, !list_empty(&mnt->mnt_mounts), mnt_get_count(mnt));
 			unlock_mount_hash();
 			return -EBUSY;
 		}
@@ -1960,6 +1962,9 @@ static int do_umount(struct mount *mnt, int flags)
 		if (!propagate_mount_busy(mnt, 2)) {
 			umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
+		} else {
+			printk("%s: task=%s[%d] propagate_mount_busy()!=0\n", __func__,
+			       current->comm, current->pid);
 		}
 	}
 out:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
[   77.754264] XFS (dm-0): Mounting V5 Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[   78.161012] XFS (dm-0): Ending clean mount
[   79.882958] XFS (dm-1): Mounting V5 Filesystem f64fdceb-5dd4-489f-ae3f-f14289964a20
[   80.264828] XFS (dm-1): Ending clean mount
[   80.466146] XFS (dm-1): Unmounting Filesystem f64fdceb-5dd4-489f-ae3f-f14289964a20
[   80.547159] XFS (dm-0): Unmounting Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[   81.477419] XFS (dm-0): Mounting V5 Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[   81.871023] XFS (dm-0): Ending clean mount
[   82.008600] run fstests xfs/259 at 2026-06-13 19:39:59
[   83.218605] loop: module loaded
[   83.315929] loop0: detected capacity change from 0 to 8589934584
[   84.897469] loop1: detected capacity change from 0 to 8589934584
[   84.997221] Flush: task=(udev-worker)[1134] dev=loop0 state=2
[   85.014004] Teardown: task=(udev-worker)[1134] dev=loop0 state=2
[   86.354657] Flush: task=losetup[2016] dev=loop1 state=2
[   86.377065] Teardown: task=losetup[2016] dev=loop1 state=2
[   86.416717] loop0: detected capacity change from 0 to 8589934584
[   87.915638] loop1: detected capacity change from 0 to 8589934588
[   88.025704] Flush: task=(udev-worker)[1134] dev=loop0 state=2
[   88.039956] Teardown: task=(udev-worker)[1134] dev=loop0 state=2
[   89.404018] Flush: task=losetup[2048] dev=loop1 state=2
[   89.418144] Teardown: task=losetup[2048] dev=loop1 state=2
[   89.460259] loop0: detected capacity change from 0 to 8589934588
[   91.034214] loop1: detected capacity change from 0 to 8589934588
[   91.149505] Flush: task=(udev-worker)[1134] dev=loop0 state=2
[   91.162932] Teardown: task=(udev-worker)[1134] dev=loop0 state=2
[   92.626690] Flush: task=losetup[2078] dev=loop1 state=2
[   92.652036] Teardown: task=losetup[2078] dev=loop1 state=2
[   92.699585] loop0: detected capacity change from 0 to 8589934590
[   94.307750] loop1: detected capacity change from 0 to 8589934590
[   94.424337] Flush: task=(udev-worker)[1134] dev=loop0 state=2
[   94.441842] Teardown: task=(udev-worker)[1134] dev=loop0 state=2
[   95.869110] Flush: task=losetup[2108] dev=loop1 state=2
[   95.885815] Teardown: task=losetup[2108] dev=loop1 state=2
[   95.931349] loop0: detected capacity change from 0 to 8589934590
[   97.447354] do_umount: task=umount[2143] propagate_mount_busy()!=0
[   97.549813] Flush: task=(udev-worker)[1134] dev=loop0 state=2
[   97.567983] Teardown: task=(udev-worker)[1134] dev=loop0 state=2

[  138.284481] XFS (dm-1): Mounting V5 Filesystem c3b0a09a-f960-464f-a741-84be44d55da0
[  138.679732] XFS (dm-1): Ending clean mount
[  138.865274] XFS (dm-1): Unmounting Filesystem c3b0a09a-f960-464f-a741-84be44d55da0
[  138.944906] XFS (dm-0): Unmounting Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[  139.833646] XFS (dm-0): Mounting V5 Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[  140.213952] XFS (dm-0): Ending clean mount
[  140.342963] run fstests xfs/259 at 2026-06-13 19:40:58
[  141.549551] loop0: detected capacity change from 0 to 8589934584
[  143.112857] loop1: detected capacity change from 0 to 8589934584
[  143.219426] Flush: task=(udev-worker)[2502] dev=loop0 state=2
[  143.240186] Teardown: task=(udev-worker)[2502] dev=loop0 state=2
[  144.575984] Flush: task=losetup[3050] dev=loop1 state=2
[  144.593206] Teardown: task=losetup[3050] dev=loop1 state=2
[  144.631137] loop0: detected capacity change from 0 to 8589934584
[  146.114670] loop1: detected capacity change from 0 to 8589934588
[  146.229225] Flush: task=(udev-worker)[2502] dev=loop0 state=2
[  146.248066] Teardown: task=(udev-worker)[2502] dev=loop0 state=2
[  147.626517] Flush: task=losetup[3080] dev=loop1 state=2
[  147.644085] Teardown: task=losetup[3080] dev=loop1 state=2
[  147.679345] loop0: detected capacity change from 0 to 8589934588
[  149.172884] loop1: detected capacity change from 0 to 8589934588
[  149.296011] Flush: task=(udev-worker)[2502] dev=loop0 state=2
[  149.315040] Teardown: task=(udev-worker)[2502] dev=loop0 state=2
[  150.631328] Flush: task=losetup[3110] dev=loop1 state=2
[  150.650082] Teardown: task=losetup[3110] dev=loop1 state=2
[  150.686210] loop0: detected capacity change from 0 to 8589934590
[  152.186234] loop1: detected capacity change from 0 to 8589934590
[  152.308623] Flush: task=(udev-worker)[2502] dev=loop0 state=2
[  152.325972] Teardown: task=(udev-worker)[2502] dev=loop0 state=2
[  153.652919] Flush: task=losetup[3140] dev=loop1 state=2
[  153.681000] Teardown: task=losetup[3140] dev=loop1 state=2
[  153.716545] loop0: detected capacity change from 0 to 8589934590
[  155.235156] do_umount: task=umount[3175] propagate_mount_busy()!=0
[  155.337055] Flush: task=(udev-worker)[2502] dev=loop0 state=2
[  155.352014] Teardown: task=(udev-worker)[2502] dev=loop0 state=2

[  182.533057] XFS (dm-1): Mounting V5 Filesystem a101f7e1-a8bd-44e2-a49f-21985ce78a3b
[  182.915714] XFS (dm-1): Ending clean mount
[  183.114481] XFS (dm-1): Unmounting Filesystem a101f7e1-a8bd-44e2-a49f-21985ce78a3b
[  183.190809] XFS (dm-0): Unmounting Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[  184.064717] XFS (dm-0): Mounting V5 Filesystem a80d3f5c-f068-4b64-bfe4-543a811c5e93
[  184.456110] XFS (dm-0): Ending clean mount
[  184.592063] run fstests xfs/259 at 2026-06-13 19:41:42
[  185.926189] loop0: detected capacity change from 0 to 8589934584
[  187.473922] loop1: detected capacity change from 0 to 8589934584
[  187.584471] Flush: task=(udev-worker)[3531] dev=loop0 state=2
[  187.603274] Teardown: task=(udev-worker)[3531] dev=loop0 state=2
[  188.961285] Flush: task=losetup[4079] dev=loop1 state=2
[  188.990308] Teardown: task=losetup[4079] dev=loop1 state=2
[  189.029614] loop0: detected capacity change from 0 to 8589934584
[  190.521249] loop1: detected capacity change from 0 to 8589934588
[  190.631200] Flush: task=(udev-worker)[4075] dev=loop0 state=2
[  190.645245] Teardown: task=(udev-worker)[4075] dev=loop0 state=2
[  192.023674] Flush: task=losetup[4109] dev=loop1 state=2
[  192.035269] Teardown: task=losetup[4109] dev=loop1 state=2
[  192.074917] loop0: detected capacity change from 0 to 8589934588
[  193.606595] loop1: detected capacity change from 0 to 8589934588
[  193.727944] Flush: task=(udev-worker)[4075] dev=loop0 state=2
[  193.747188] Teardown: task=(udev-worker)[4075] dev=loop0 state=2
[  195.075694] Flush: task=losetup[4140] dev=loop1 state=2
[  195.092257] Teardown: task=losetup[4140] dev=loop1 state=2
[  195.128969] loop0: detected capacity change from 0 to 8589934590
[  196.636772] loop1: detected capacity change from 0 to 8589934590
[  196.758941] Flush: task=(udev-worker)[4075] dev=loop0 state=2
[  196.779155] Teardown: task=(udev-worker)[4075] dev=loop0 state=2
[  198.105666] Flush: task=losetup[4170] dev=loop1 state=2
[  198.128203] Teardown: task=losetup[4170] dev=loop1 state=2
[  198.163723] loop0: detected capacity change from 0 to 8589934590
[  199.685651] do_umount: task=umount[4205] propagate_mount_busy()!=0
[  199.787468] Flush: task=(udev-worker)[4075] dev=loop0 state=2
[  199.804123] Teardown: task=(udev-worker)[4075] dev=loop0 state=2
--------------------------------------------------------------------------------

That is, if someone else (e.g. udev-worker) by chance has a file descriptor of a loop device
when "losetup -d" called lo_release(), __loop_clr_fd() is not called by "losetup" due to
commit 18048c1af783 ("loop: Fix a race between loop detach and loop open"). Therefore,
I consider that this teardown operation is racy regardless of whether disk->open_mutex is
temporarily released or not. Actually, I can reproduce this problem with the change below
(lockdep warning aside).

------------------------------------------------------------
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c3b607a3ddc4..076207efb1cc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1778,7 +1778,7 @@ static void lo_release(struct gendisk *disk)
 		 * the Lo_rundown state guarantees that lo_open() will fail with -ENXIO.
 		 * Thus, there will be effectively no change caused by this violation.
 		 */
-		mutex_unlock(&lo->lo_disk->open_mutex);
+		//mutex_unlock(&lo->lo_disk->open_mutex);
 		/*
 		 * Now that loop_queue_rq() sees lo->lo_state != Lo_bound,
 		 * wait for already started loop_queue_rq() to complete.
@@ -1810,7 +1810,7 @@ static void lo_release(struct gendisk *disk)
 		 * released disk->open_mutex, for I am the only and the last user of
 		 * this loop device because lo_open() cannot succeed.
 		 */
-		mutex_lock(&lo->lo_disk->open_mutex);
+		//mutex_lock(&lo->lo_disk->open_mutex);
 		if (WARN_ON(data_race(READ_ONCE(lo->lo_state)) != Lo_rundown))
 			return;
 		__loop_clr_fd(lo);
------------------------------------------------------------

------------------------------------------------------------
FSTYP         -- xfs (non-debug)
PLATFORM      -- Linux/x86_64 localhost 7.1.0-rc7-next-20260608-dirty #23 SMP PREEMPT_DYNAMIC Sat Jun 13 19:19:37 JST 2026
MKFS_OPTIONS  -- -f /dev/mapper/delayed_scratch
MOUNT_OPTIONS -- /dev/mapper/delayed_scratch /home/scratch

xfs/259 61s ... umount: /home/test: target is busy.
_check_xfs_filesystem: filesystem on /dev/mapper/delayed_test has dirty log
(see /var/lib/xfstests/results//xfs/259.full for details)
_check_xfs_filesystem: filesystem on /dev/mapper/delayed_test is inconsistent (r)
(see /var/lib/xfstests/results//xfs/259.full for details)
Trying to repair broken TEST_DEV file system
_repair_test_fs: failed, err=1
(see /var/lib/xfstests/results//xfs/259.full for details)

Ran: xfs/259
Failures: xfs/259
Failed 1 of 1 tests

------------------------------------------------------------

Therefore, although commit fb1d5846e99c ("loop: Fix NULL pointer dereference in
lo_rw_aio()") might have widened the race window, I think that this is a problem
which should be addressed by updating the "umount" user.


^ permalink raw reply related

* Re: [PATCH v3] rust: add procedural macro for declaring configfs attributes
From: Miguel Ojeda @ 2026-06-13 10:41 UTC (permalink / raw)
  To: Malte Wechter
  Cc: Andreas Hindborg, Breno Leitao, Miguel Ojeda, Boqun Feng,
	Gary Guo, Björn Roy Baron, Benno Lossin, Alice Ryhl,
	Trevor Gross, Danilo Krummrich, Jens Axboe, Luis Chamberlain,
	Petr Pavlu, Daniel Gomez, Sami Tolvanen, Aaron Tomlin,
	linux-kernel, rust-for-linux, linux-block, linux-modules
In-Reply-To: <20260612-configfs-syn-v3-1-3292fbc5cc32@gmail.com>

Hi Malte,

Some quick notes...

On Fri, Jun 12, 2026 at 3:29 PM Malte Wechter <maltewechter@gmail.com> wrote:
>
> +/// ```ignore

Empty /// before examples.

> +///     // This will extract "foo: <field>" into a variable named "foo".

` instead of "

i.e. please use Markdown

> +///```

Missing space indentation

> +/// Expands the following output:
> +///    let item_type = {

Missing example block, both at the beginning and the end.

Please double-check by generating the docs and looking at how they
appear in the browser.

The prefix of the title should likely be `rust: configfs:`.

Thanks!

Cheers,
Miguel

^ permalink raw reply

* Re: [PATCH] nbd: Reclassify sockets to avoid lockdep circular dependency
From: Hillf Danton @ 2026-06-13 10:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, Jens Axboe, linux-block, nbd, Kuniyuki Iwashima,
	netdev, syzbot+607cdcf978b3e79da878
In-Reply-To: <20260613042619.1108126-1-edumazet@google.com>

On Sat, 13 Jun 2026 04:26:19 +0000 Eric Dumazet wrote:
> syzbot reported a possible circular locking dependency in udp_sendmsg()
> where fs_reclaim can be triggered while holding sk_lock, and fs_reclaim
> can eventually depend on another sk_lock (e.g., if NBD is used for swap
> or writeback and NBD uses TLS/TCP which acquires sk_lock).
> 
> Since the UDP socket and the NBD TCP/TLS socket are different, this is a
> false positive. Fix this by reclassifying NBD sockets to a separate lock
> class when they are added to the NBD device.
> 
> This is similar to what nvme-tcp and other network block devices do.
> 
> Fixes: ffa1e7ada456 ("block: Make request_queue lockdep splats show up earlier")

Given the Fixes tag, can you specify anything wrong that commit added?

> Reported-by: syzbot+607cdcf978b3e79da878@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/6a2cdafe.428ffe26.258b27.0161.GAE@google.com/T/#u
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---

^ permalink raw reply

* [PATCH] nbd: Reclassify sockets to avoid lockdep circular dependency
From: Eric Dumazet @ 2026-06-13  4:26 UTC (permalink / raw)
  To: Josef Bacik, Jens Axboe
  Cc: linux-kernel, linux-block, nbd, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Kuniyuki Iwashima, netdev, Eric Dumazet,
	syzbot+607cdcf978b3e79da878

syzbot reported a possible circular locking dependency in udp_sendmsg()
where fs_reclaim can be triggered while holding sk_lock, and fs_reclaim
can eventually depend on another sk_lock (e.g., if NBD is used for swap
or writeback and NBD uses TLS/TCP which acquires sk_lock).

Since the UDP socket and the NBD TCP/TLS socket are different, this is a
false positive. Fix this by reclassifying NBD sockets to a separate lock
class when they are added to the NBD device.

This is similar to what nvme-tcp and other network block devices do.

Fixes: ffa1e7ada456 ("block: Make request_queue lockdep splats show up earlier")
Reported-by: syzbot+607cdcf978b3e79da878@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6a2cdafe.428ffe26.258b27.0161.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/block/nbd.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index fe63f3c55d0d960a1a4bbb2c60738cbbece10719..0e2180e910c4eaaa58556a0c75c1b9c3fdc1930d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1238,6 +1238,42 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
 	return sock;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key nbd_key[3];
+static struct lock_class_key nbd_slock_key[3];
+
+static void nbd_reclassify_socket(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+		return;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET-NBD",
+					      &nbd_slock_key[0],
+					      "sk_lock-AF_INET-NBD",
+					      &nbd_key[0]);
+		break;
+	case AF_INET6:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NBD",
+					      &nbd_slock_key[1],
+					      "sk_lock-AF_INET6-NBD",
+					      &nbd_key[1]);
+		break;
+	case AF_UNIX:
+		sock_lock_init_class_and_name(sk, "slock-AF_UNIX-NBD",
+					      &nbd_slock_key[2],
+					      "sk_lock-AF_UNIX-NBD",
+					      &nbd_key[2]);
+		break;
+	}
+}
+#else
+static inline void nbd_reclassify_socket(struct socket *sock) {}
+#endif
+
 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 			  bool netlink)
 {
@@ -1254,6 +1290,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 	sock = nbd_get_socket(nbd, arg, &err);
 	if (!sock)
 		return err;
+	nbd_reclassify_socket(sock);
 
 	/*
 	 * We need to make sure we don't get any errant requests while we're
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

* Re: [PATCH v2 02/14] firewire: core: Open-code topology list walk
From: Takashi Sakamoto @ 2026-06-13  2:11 UTC (permalink / raw)
  To: Kaitao Cheng
  Cc: Andy Shevchenko, Muchun Song, Philipp Reisner, Lars Ellenberg,
	Christoph Böhmwalder, Jens Axboe, Andrzej Hajda,
	Neil Armstrong, Robert Foss, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Jani Nikula,
	Joonas Lahtinen, Rodrigo Vivi, Tvrtko Ursulin, Christian Koenig,
	Huang Rui, Eddie James, Mark Brown, Maxime Coquelin,
	Alexandre Torgue, Laxman Dewangan, Thierry Reding,
	Jonathan Hunter, Sowjanya Komatineni, Davidlohr Bueso,
	Paul E . McKenney, Josh Triplett, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Boqun Feng, Liam Girdwood, Jaroslav Kysela,
	Takashi Iwai, Laurent Pinchart, Jonas Karlman, Jernej Skrabec,
	Matthew Auld, Matthew Brost, Waiman Long, drbd-dev, linux-block,
	linux1394-devel, dri-devel, intel-gfx, linux-spi, linux-stm32,
	linux-arm-kernel, linux-tegra, linux-sound, linux-kernel,
	Andrew Morton, Randy Dunlap, Christian Brauner, David Howells,
	Luca Ceresoli, Kaitao Cheng
In-Reply-To: <20260609061347.93688-3-kaitao.cheng@linux.dev>

Hi,

On Tue, Jun 09, 2026 at 02:13:35PM +0800, Kaitao Cheng wrote:
> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> 
> A later change will make list_for_each_entry() cache the next element
> before entering the loop body. for_each_fw_node() intentionally appends
> newly discovered child nodes to the temporary walk list while the list is
> being traversed.
> 
> Keep the loop open-coded so the next node is looked up only after
> children have been appended. This preserves the current breadth-first
> traversal semantics and prepares the code for the list iterator update.
> 
> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> ---
>  drivers/firewire/core-topology.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Thanks for the patch.

Last September I realized the issue but have not solved it yet[1]. A pointer
array would be another candidate to store the found nodes, since the IEEE 1394
bus has a restriction on the maximum number of nodes, up to 256. But it is
too large to put on the kernel stack, while it is slightly difficult to
allocate it on the heap dynamically since the function is called while
holding a spinlock.

Anyway, there is no objection to your change. Let me apply it to
for-next branch so that your further work goes well with no blocks
locating in this subsystem.

[1] https://social.kernel.org/notice/AyDqvLkpwUvI5eyokK


Thanks

Takashi Sakamoto

^ permalink raw reply

* [PATCH net v2 2/2] vsock/virtio: restore msg_iter on transmission failure
From: Octavian Purdila @ 2026-06-13  0:09 UTC (permalink / raw)
  To: netdev
  Cc: Alexander Viro, Andrew Morton, Arseniy Krasnov, David S. Miller,
	Eric Dumazet, Eugenio Pérez, Jakub Kicinski, Jason Wang, kvm,
	linux-block, linux-fsdevel, linux-kernel, Michael S. Tsirkin,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Stefano Garzarella,
	virtualization, Xuan Zhuo, Octavian Purdila,
	syzbot+28e5f3d207b14bae122a
In-Reply-To: <20260613000953.467473-1-tavip@google.com>

When transmission fails in virtio_transport_send_pkt_info, the msg_iter
might have been partially advanced. If we don't restore it, the next
attempt to send data will use an incorrect iterator state, leading to
desync and warnings like "send_pkt() returns 0, but X expected".

Specifically, this can happen in the following scenario, triggered by
the syzkaller repro:

1. A write-only VMA (PROT_WRITE only) is partially populated (by a
   prior TUN write that failed with -EIO but still faulted in some
   pages).
2. A vsock sendmmsg call with MSG_ZEROCOPY requests transmission of a
   buffer from this VMA.
3. The first packet (64KB) is sent successfully because the pages are
   populated.
4. The second packet allocation fails because GUP fast pins the first page
   but GUP slow fails on the next unpopulated page due to PROT_WRITE-only
   permissions.
5. The iterator is advanced by the partially successful GUP (68KB total
   advanced: 64KB from first packet + 4KB from second), but the send loop
   breaks and only reports 64KB sent. This creates a 4KB desync.
6. The next retry starts with a non-zero iov_offset, disabling zerocopy
   and falling back to copy mode.
7. In copy mode, the transmission succeeds for the next packets but
   exhausts the iterator early because of the desync.
8. The final retry sees an empty iterator but zerocopy is re-enabled
   (offset resets). It attempts to send the remaining bytes with zerocopy
   but pins 0 pages, creating an empty packet.
9. The transport sends the empty packet, triggering the warning because
   the returned bytes (header only) do not match the expected payload size.
10. The loop continues to spin, allocating ubuf_info each time, eventually
    exhausting sysctl_optmem_max and returning -ENOMEM to userspace.

Restore msg_iter to its original state before the packet allocation
and transmission attempt if they fail.

Fixes: e0718bd82e27 ("vsock: enable setting SO_ZEROCOPY")
Reported-by: syzbot+28e5f3d207b14bae122a@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=28e5f3d207b14bae122a
Assisted-by: gemini:gemini-3.1-pro
Signed-off-by: Octavian Purdila <tavip@google.com>
---
 net/vmw_vsock/virtio_transport_common.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index b10666937c490..2baa5a6ebd750 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -295,6 +295,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
 	u32 src_cid, src_port, dst_cid, dst_port;
 	const struct virtio_transport *t_ops;
+	struct iov_iter_state msg_iter_state;
 	struct virtio_vsock_sock *vvs;
 	struct ubuf_info *uarg = NULL;
 	u32 pkt_len = info->pkt_len;
@@ -368,8 +369,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 		struct sk_buff *skb;
 		size_t skb_len;
 
+		/* Save iterator state in case allocation or transmission fails
+		 * so we can restore it and retry.
+		 */
+		if (info->msg)
+			iov_iter_save_state(&info->msg->msg_iter, &msg_iter_state);
+
 		skb_len = min(max_skb_len, rest_len);
 
+		/* Note: virtio_transport_alloc_skb() can advance info->msg->msg_iter
+		 * even if it fails (e.g. partial GUP success).
+		 */
 		skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
 						 uarg,
 						 src_cid, src_port,
@@ -399,6 +409,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 			break;
 	} while (rest_len);
 
+	if (info->msg && ret < 0)
+		iov_iter_restore(&info->msg->msg_iter, &msg_iter_state);
+
 	virtio_transport_put_credit(vvs, rest_len);
 
 	/* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1.
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox