Linux block layer
 help / color / mirror / Atom feed
* [PATCHv2 4/6] loop: set dma_alignment from the backing file for direct I/O
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

Direct I/O user pages are forwarded to the backing file unchanged, so
the backing's DMA alignment requirement applies to them. Track the
backing's dio_mem_align and advertise it as the loop device's
dma_alignment so we advertise proper limits and misaligned I/O is
rejected here instead of being dispatched to the backend.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/block/loop.c | 50 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 310de0463beb1..7114f80ab162a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -54,6 +54,7 @@ struct loop_device {
 
 	struct file	*lo_backing_file;
 	unsigned int	lo_min_dio_size;
+	unsigned int	lo_dio_mem_align;
 	struct block_device *lo_device;
 
 	gfp_t		old_gfp_mask;
@@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo)
 			__func__, lo->lo_number, lo->lo_file_name, rc);
 }
 
-static unsigned int loop_query_min_dio_size(struct loop_device *lo)
+static void loop_update_dio_alignment(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
 	struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
 	struct kstat st;
 
 	/*
-	 * Use the minimal dio alignment of the file system if provided.
+	 * Use the dio alignment of the file system if provided.  dio_offset_align
+	 * is the minimum dio size and offset; dio_mem_align is the buffer memory
+	 * alignment, kept as a mask to become the loop device's dma_alignment in
+	 * direct I/O mode where the buffer is handed to the backing file unchanged.
 	 */
 	if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
-	    (st.result_mask & STATX_DIOALIGN))
-		return st.dio_offset_align;
+	    (st.result_mask & STATX_DIOALIGN)) {
+		lo->lo_min_dio_size = st.dio_offset_align;
+		lo->lo_dio_mem_align = st.dio_mem_align - 1;
+		return;
+	}
 
 	/*
 	 * In a perfect world this wouldn't be needed, but as of Linux 6.13 only
 	 * a handful of file systems support the STATX_DIOALIGN flag.
 	 */
-	if (sb_bdev)
-		return bdev_logical_block_size(sb_bdev);
-	return SECTOR_SIZE;
+	if (sb_bdev) {
+		lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev);
+		lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev);
+		return;
+	}
+
+	lo->lo_min_dio_size = SECTOR_SIZE;
+	lo->lo_dio_mem_align = SECTOR_SIZE - 1;
 }
 
 static inline int is_loop_device(struct file *file)
@@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
 			lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
 	if (lo->lo_backing_file->f_flags & O_DIRECT)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
-	lo->lo_min_dio_size = loop_query_min_dio_size(lo);
+	loop_update_dio_alignment(lo);
 }
 
 static int loop_check_backing_file(struct file *file)
@@ -961,6 +973,17 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
 	lim->logical_block_size = bsize;
 	lim->physical_block_size = bsize;
 	lim->io_min = bsize;
+	/*
+	 * In direct I/O the user pages are handed to the backing file as-is, so
+	 * the backing's DMA alignment requirement applies to them.  Advertise it
+	 * so misaligned I/O is rejected at this device's entry instead of being
+	 * dispatched to the backend.  Buffered I/O copies through the page cache
+	 * and imposes no such requirement.
+	 */
+	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+		lim->dma_alignment = lo->lo_dio_mem_align;
+	else
+		lim->dma_alignment = SECTOR_SIZE - 1;
 	lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
 	if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
 		lim->features |= BLK_FEAT_WRITE_CACHE;
@@ -1416,6 +1439,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 {
 	bool use_dio = !!arg;
 	unsigned int memflags;
+	struct queue_limits lim;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1434,6 +1458,16 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 	else
 		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+	/*
+	 * Direct I/O forwards the user pages to the backing file unchanged, so
+	 * track the backing's DMA alignment requirement as the mode is toggled.
+	 */
+	lim = queue_limits_start_update(lo->lo_queue);
+	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+		lim.dma_alignment = lo->lo_dio_mem_align;
+	else
+		lim.dma_alignment = SECTOR_SIZE - 1;
+	queue_limits_commit_update(lo->lo_queue, &lim);
 	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
 	return 0;
 }
-- 
2.52.0


^ permalink raw reply related

* [PATCHv2 5/6] zloop: set dma_alignment from the backing files for direct I/O
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

Direct I/O requests use pages handed to the backing files unchanged, so
the backing's DMA alignment requirement applies. Track dio_mem_align and
advertise it as the device's dma_alignment so we communicate proper
limits and misaligned I/O is rejected here instead of reaching the
backend.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/block/zloop.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 55eeb6aac0ea3..1149b817b5bc9 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -144,6 +144,7 @@ struct zloop_device {
 	unsigned int		nr_conv_zones;
 	unsigned int		max_open_zones;
 	unsigned int		block_size;
+	unsigned int		dio_mem_align;
 
 	spinlock_t		open_zones_lock;
 	struct list_head	open_zones_lru_list;
@@ -1035,6 +1036,9 @@ static int zloop_get_block_size(struct zloop_device *zlo,
 {
 	struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
 	struct kstat st;
+	bool have_dioalign = !vfs_getattr(&zone->file->f_path, &st,
+					  STATX_DIOALIGN, 0) &&
+			     (st.result_mask & STATX_DIOALIGN);
 
 	/*
 	 * If the FS block size is lower than or equal to 4K, use that as the
@@ -1044,14 +1048,25 @@ static int zloop_get_block_size(struct zloop_device *zlo,
 	 */
 	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
 		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
-	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
-		 (st.result_mask & STATX_DIOALIGN))
+	else if (have_dioalign)
 		zlo->block_size = st.dio_offset_align;
 	else if (sb_bdev)
 		zlo->block_size = bdev_physical_block_size(sb_bdev);
 	else
 		zlo->block_size = SECTOR_SIZE;
 
+	/*
+	 * In direct I/O the request's pages are handed to the backing files
+	 * unchanged, so track their required memory alignment as a mask for
+	 * dma_alignment.
+	 */
+	if (have_dioalign)
+		zlo->dio_mem_align = st.dio_mem_align - 1;
+	else if (sb_bdev)
+		zlo->dio_mem_align = bdev_dma_alignment(sb_bdev);
+	else
+		zlo->dio_mem_align = SECTOR_SIZE - 1;
+
 	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
 		pr_err("Zone capacity is not aligned to block size %u\n",
 		       zlo->block_size);
@@ -1279,6 +1294,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
 
 	lim.physical_block_size = zlo->block_size;
 	lim.logical_block_size = zlo->block_size;
+	/* Direct I/O hands the request's pages to the backing files unchanged. */
+	if (!opts->buffered_io)
+		lim.dma_alignment = zlo->dio_mem_align;
 	if (zlo->zone_append)
 		lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
 	lim.max_open_zones = zlo->max_open_zones;
-- 
2.52.0


^ permalink raw reply related

* [PATCHv2 2/6] block: report the actual status
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

Rather than assume EIO, set the actual reported status for user space
informational purposes.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/fops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/fops.c b/block/fops.c
index 15783a6180dec..f237d6cab8975 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -218,7 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
 		if (unlikely(ret)) {
-			bio_endio_status(bio, BLK_STS_IOERR);
+			bio_endio_errno(bio, ret);
 			break;
 		}
 		if (iocb->ki_flags & IOCB_NOWAIT) {
-- 
2.52.0


^ permalink raw reply related

* [PATCHv2 6/6] block: validate user space vectors during extraction
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch, stable
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

The bio-based drivers don't necessarily check the alignment split, and
stacking block drivers don't always handle a misalignment detected after
submitting the bio. Validate user vectors against the device's
dma_alignment as the bio is built from the iov_iter, rejecting
misaligned vectors early with -EINVAL.

Cc: stable@vger.kernel.org
Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c          | 50 +++++++++++++++++++++++++++++++++++++++++---
 block/blk-map.c      |  2 +-
 block/fops.c         |  1 +
 fs/iomap/direct-io.c |  1 +
 include/linux/bio.h  |  2 +-
 include/linux/uio.h  |  3 ++-
 lib/iov_iter.c       |  9 +++++++-
 7 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index f2a5f4d0a9672..4360149d4eba2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1220,10 +1220,39 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 	return 0;
 }
 
+#ifdef CONFIG_DEBUG_KERNEL
+static inline bool bio_iov_bvec_aligned(const struct bio *bio,
+					unsigned mem_align_mask)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	for_each_mp_bvec(bv, bio->bi_io_vec, iter, bio->bi_iter)
+		if ((bv.bv_offset | bv.bv_len) & mem_align_mask)
+			return false;
+	return true;
+}
+#else
+static inline bool bio_iov_bvec_aligned(const struct bio *bio,
+					unsigned mem_align_mask)
+{
+	/*
+	 * The vectors are owned and laid out by the caller; we only forward
+	 * them. Most callers are already aligned, but io_uring can place a
+	 * user chosen offset through a registered buffer, where only the first
+	 * vector may be unaligned.
+	 */
+	return !(mp_bvec_iter_offset(bio->bi_io_vec, bio->bi_iter) &
+							mem_align_mask);
+}
+#endif
+
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
  * @iter: iov iterator describing the region to be added
+ * @mem_align_mask: the mask the source address and length must be aligned to,
+ *	0 for no requirement
  * @len_align_mask: the mask to align the total size to, 0 for any length
  *
  * This takes either an iterator pointing to user memory, or one pointing to
@@ -1242,7 +1271,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
  * is returned only if 0 pages could be pinned.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
-			   unsigned len_align_mask)
+			   unsigned mem_align_mask, unsigned len_align_mask)
 {
 	iov_iter_extraction_t flags = 0;
 
@@ -1251,6 +1280,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 
 	if (iov_iter_is_bvec(iter)) {
 		bio_iov_bvec_set(bio, iter);
+
+		if (!bio_iov_bvec_aligned(bio, mem_align_mask))
+			return -EINVAL;
+
 		iov_iter_advance(iter, bio->bi_iter.bi_size);
 		return 0;
 	}
@@ -1265,8 +1298,19 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 
 		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
 				BIO_MAX_SIZE - bio->bi_iter.bi_size,
-				&bio->bi_vcnt, bio->bi_max_vecs, flags);
+				&bio->bi_vcnt, bio->bi_max_vecs,
+				mem_align_mask, flags);
 		if (ret <= 0) {
+			/*
+			 * A misaligned vector fails the whole I/O.  Release any
+			 * pages pinned by earlier iterations before returning
+			 * since this bio won't be submitted to release them.
+			 */
+			if (ret == -EINVAL) {
+				bio_release_pages(bio, false);
+				bio_clear_flag(bio, BIO_PAGE_PINNED);
+				bio->bi_vcnt = 0;
+			}
 			if (!bio->bi_vcnt)
 				return ret;
 			break;
@@ -1377,7 +1421,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
 		ssize_t ret;
 
 		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
-				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0, 0);
 		if (ret <= 0) {
 			if (!bio->bi_vcnt) {
 				folio_put(folio);
diff --git a/block/blk-map.c b/block/blk-map.c
index 768549f19f97e..c9535efe1a913 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -274,7 +274,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 	 * No alignment requirements on our part to support arbitrary
 	 * passthrough commands.
 	 */
-	ret = bio_iov_iter_get_pages(bio, iter, 0);
+	ret = bio_iov_iter_get_pages(bio, iter, 0, 0);
 	if (ret)
 		goto out_put;
 	ret = blk_rq_append_bio(rq, bio);
diff --git a/block/fops.c b/block/fops.c
index b5c320da28123..84eeabd97e1f0 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -47,6 +47,7 @@ static inline int blkdev_iov_iter_get_pages(struct bio *bio,
 		struct iov_iter *iter, struct block_device *bdev)
 {
 	return bio_iov_iter_get_pages(bio, iter,
+			bdev_dma_alignment(bdev),
 			bdev_logical_block_size(bdev) - 1);
 }
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b485e3b191daf..ff458aa12ae29 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -358,6 +358,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 				iomap_max_bio_size(&iter->iomap), alignment);
 	else
 		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
+					     bdev_dma_alignment(bio->bi_bdev),
 					     alignment - 1);
 	if (unlikely(ret))
 		goto out_put_bio;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8f33f717b14f5..ce34ea49ef358 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -477,7 +477,7 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
 		size_t len, enum req_op op);
 
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
-		unsigned len_align_mask);
+		unsigned mem_align_mask, unsigned len_align_mask);
 
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index a9bc5b3067e32..653dee76c0b33 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -391,7 +391,8 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
 			       size_t *offset0);
 ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
 		size_t max_size, unsigned short *nr_vecs,
-		unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
+		unsigned short max_vecs, unsigned mem_align_mask,
+		iov_iter_extraction_t extraction_flags);
 
 /**
  * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 273919b161617..8d5ca3e38522a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1886,6 +1886,8 @@ static unsigned int get_contig_folio_len(struct page **pages,
  * @max_size:	maximum size to extract from @iter
  * @nr_vecs:	number of vectors in @bv (on in and output)
  * @max_vecs:	maximum vectors in @bv, including those filled before calling
+ * @mem_align_mask:	reject with -EINVAL if the source address or length is not
+ *		aligned to this mask
  * @extraction_flags: flags to qualify request
  *
  * Like iov_iter_extract_pages(), but returns physically contiguous ranges
@@ -1897,14 +1899,19 @@ static unsigned int get_contig_folio_len(struct page **pages,
  */
 ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
 		size_t max_size, unsigned short *nr_vecs,
-		unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
+		unsigned short max_vecs, unsigned mem_align_mask,
+		iov_iter_extraction_t extraction_flags)
 {
+	unsigned long start = (unsigned long)iter_iov_addr(iter);
 	unsigned short entries_left = max_vecs - *nr_vecs;
 	unsigned short nr_pages, i = 0;
 	size_t left, offset, len;
 	struct page **pages;
 	ssize_t size;
 
+	if ((start | iter_iov_len(iter)) & mem_align_mask)
+		return -EINVAL;
+
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
 	 * possible so that we can start filling biovecs from the beginning
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v4 0/3] crypto: skcipher - per-request multi-data-unit batching
From: Eric Biggers @ 2026-06-22 18:23 UTC (permalink / raw)
  To: Leonid Ravich
  Cc: Herbert Xu, Alasdair Kergon, Ard Biesheuvel, Jens Axboe, dm-devel,
	linux-block
In-Reply-To: <20260622071044.4079-1-lravich@amazon.com>

On Mon, Jun 22, 2026 at 07:10:44AM +0000, Leonid Ravich wrote:
> On Mon, Jun 15, 2026 at 03:53:17PM -0700, Eric Biggers wrote:
> > So in other words, this series slows down dm-crypt and crypto_skcipher
> > for everyone to optimize for an out-of-tree driver.  And there's also no
> > benchmark showing that your driver is even worth it over just using the
> > CPU.
> 
> I measured on arm64 (Graviton3, dm-crypt + xts-aes-ce, RAM-backed,
> fixed CPU freq):
> 
>   - 4 KiB random write, 512-byte sectors: v4 as posted regressed ~5%.
>     Root cause (ftrace): a per-bio kmalloc_array() for the scatterlists,
>     where the per-sector path uses dm-crypt's inline sg_in[]/sg_out[].
> 
>   - Reusing the inline arrays when the segment count fits (heap only for
>     larger bios) removes the regression, back to parity. This will be in
>     the dm-crypt patch for v5.
> 
> So the software path is neutral after the fix, not slower. No software throughput win
> either: the auto-splitter still calls alg->encrypt per data unit. The win
> is for a consumer that takes the whole request in one pass, a HW engine,
> or any async offload engine that pays a fixed per-request cost,
> it currently pays once per sector instead of once per bio.
> 
> I'd rather not over-complicate the patches until there's a general
> ack on the direction: per-request data_unit_size + auto-split,
> enabling one-pass consumers, neutral for everyone else. Is that direction
> acceptable? If so I'll respin v5.

I don't think there's a path forward without an in-tree user that's
shown to be worthwhile over just using the acceleration built directly
into the CPU.  As well as confirmation of no regression to existing
users, including in cases where the inline sg list can't be used.

- Eric

^ permalink raw reply

* Re: [PATCH blktests] scsi/009: fix unset bytes_to_write in TEST 8
From: Shin'ichiro Kawasaki @ 2026-06-22 21:30 UTC (permalink / raw)
  To: Sebastian Chlad; +Cc: linux-block, Sebastian Chlad, alan.adamson
In-Reply-To: <ajIhtkOIMXeM6BAI@shinmob>

On Jun 17, 2026 / 13:29, Shin'ichiro Kawasaki wrote:
> CC+ Alan,
> 
> On Jun 14, 2026 / 20:16, Sebastian Chlad wrote:
> > bytes_to_write was never assigned before TEST 8, causing it to pass for
> > the wrong reason. Set it to atomic_unit_max_bytes + logical_block_size
> > and update the golden output with the expected "pwrite: Invalid argument"
> > from xfs_io.
> > 
> > Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
> 
> Thanks. The change looks good to me.
> 
> I will wait a few more days just in case anyone has opinion on the change.
> FYI: Sebastian posted a similar change for nvme/059 [*].
> 
> [*] https://github.com/linux-blktests/blktests/pull/245

I applied the patch. Thanks!

^ permalink raw reply

* Re: [PATCH] block: fix incorrect error injection static key decrement
From: Jens Axboe @ 2026-06-22 22:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: dlemoal, linux-block
In-Reply-To: <20260622160752.1552516-1-hch@lst.de>


On Mon, 22 Jun 2026 18:07:52 +0200, Christoph Hellwig wrote:
> Only decrement the static key when we had items and thus it was
> incremented before.

Applied, thanks!

[1/1] block: fix incorrect error injection static key decrement
      commit: 214cdae69dba9bb1fc0b517b7fb97bab385a2e3a

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH] block, bfq: protect async queue reset with blkcg locks
From: Jens Axboe @ 2026-06-22 22:00 UTC (permalink / raw)
  To: Yu Kuai, Tejun Heo, Josef Bacik, Arianna Avanzini, Paolo Valente,
	Cen Zhang
  Cc: linux-block, cgroups, linux-kernel, baijiaju1990
In-Reply-To: <20260621135930.2657810-1-zzzccc427@gmail.com>


On Sun, 21 Jun 2026 21:59:30 +0800, Cen Zhang wrote:
> Writing 0 to BFQ's low_latency attribute ends weight raising for active,
> idle and async queues. The async cgroup path walks q->blkg_list, converts
> each blkg to BFQ policy data and then reads bfqg->async_bfqq and
> bfqg->async_idle_bfqq.
> 
> That walk was protected only by bfqd->lock. blkcg release work is
> serialized by q->blkcg_mutex and q->queue_lock instead, and
> blkg_free_workfn() can call BFQ's pd_free_fn before it removes
> blkg->q_node from q->blkg_list. A low_latency reset can therefore still
> find the blkg on the queue list after the BFQ policy data has been freed.
> 
> [...]

Applied, thanks!

[1/1] block, bfq: protect async queue reset with blkcg locks
      commit: 17b2d950a3c0328ed749476e6118ca869b3ca8b5

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH] nbd: don't warn when reclassifying a busy socket lock
From: Jens Axboe @ 2026-06-22 22:00 UTC (permalink / raw)
  To: josef, edumazet, Deepanshu Kartikey
  Cc: linux-block, nbd, linux-kernel, syzbot+6b85d1e39a5b8ed9a954
In-Reply-To: <20260621235255.66015-1-kartikey406@gmail.com>


On Mon, 22 Jun 2026 05:22:55 +0530, Deepanshu Kartikey wrote:
> nbd_reclassify_socket() warns via WARN_ON_ONCE() if the socket lock is
> held at the point of reclassification. That assertion was copied from
> nvme-tcp, where the socket is created internally by the kernel
> (sock_create_kern()) and is never visible to user space, so the lock
> is guaranteed to be free.
> 
> NBD is different: the socket is looked up from a user-supplied fd in
> nbd_get_socket(), and user space retains that fd. A concurrent syscall
> on the same socket (or softirq processing taking bh_lock_sock() on a
> connected TCP socket) can legitimately hold the lock at the instant
> NBD reclassifies it. sock_allow_reclassification() then returns false
> and the WARN_ON_ONCE() fires, which turns into a crash under
> panic_on_warn. This is reachable by simply racing NBD_CMD_CONNECT
> against socket activity on the same fd, as reported by syzbot.
> 
> [...]

Applied, thanks!

[1/1] nbd: don't warn when reclassifying a busy socket lock
      commit: 9280e6edf65662b6aafc8b704ad065b54c08b519

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH V2] blk-cgroup: fix UAF in __blkcg_rstat_flush()
From: Jens Axboe @ 2026-06-22 22:01 UTC (permalink / raw)
  To: linux-block, Ming Lei
  Cc: Michal Koutný, stable, Jay Shin, Tejun Heo, Waiman Long,
	coregee2000
In-Reply-To: <20260205155425.342084-1-ming.lei@redhat.com>


On Thu, 05 Feb 2026 23:54:23 +0800, Ming Lei wrote:
> When multiple blkgs in the same blkcg are released concurrently,
> a use-after-free can occur. The race happens when one blkg's
> __blkcg_rstat_flush() removes another blkg's iostat entries via
> llist_del_all(). The second blkg sees an empty list and proceeds
> to free itself while the first is still iterating over its entries.
> 
> Move the flush from __blkg_release() (RCU callback) to blkg_release()
> (before call_rcu). This ensures the RCU grace period waits for any
> concurrent flush's rcu_read_lock() section to complete before freeing.
> 
> [...]

Applied, thanks!

[1/1] blk-cgroup: fix UAF in __blkcg_rstat_flush()
      commit: 0ab5ee5a1badb58cbb2242617cb01a4972b1f2a2

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH V3] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
From: Jens Axboe @ 2026-06-22 22:01 UTC (permalink / raw)
  To: tj, josef, linux-block, Zizhi Wo
  Cc: cgroups, yangerkun, chengzhihao1, houtao1, yukuai
In-Reply-To: <20260616011746.2451461-1-wozizhi@huaweicloud.com>


On Tue, 16 Jun 2026 09:17:46 +0800, Zizhi Wo wrote:
> [BUG]
> Our fuzz testing triggered a blkcg use-after-free issue:
> 
>   BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0
>   Call Trace:
>   ...
>   blkcg_deactivate_policy+0x244/0x4d0
>   ioc_rqos_exit+0x44/0xe0
>   rq_qos_exit+0xba/0x120
>   __del_gendisk+0x50b/0x800
>   del_gendisk+0xff/0x190
>   ...
> 
> [...]

Applied, thanks!

[1/1] blk-cgroup: defer blkcg css_put until blkg is unlinked from queue
      commit: 3ed9b4779a4aa3f44cd9f78627498d7adac40daa

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH 1/2 blktests] src/miniublk: switch to ioctl-encoded ublk commands
From: Shin'ichiro Kawasaki @ 2026-06-22 22:21 UTC (permalink / raw)
  To: Sebastian Chlad; +Cc: Sebastian Chlad, linux-block
In-Reply-To: <CAJR+Y9K=0C+TnKAycdXbeQF98FE=RhaYYvK6SCpLPbdeMH2Xxw@mail.gmail.com>

On Jun 22, 2026 / 15:34, Sebastian Chlad wrote:
[...]
> > > diff --git a/src/miniublk.c b/src/miniublk.c
> > > index f98f850..5a35ca7 100644
> > > --- a/src/miniublk.c
> > > +++ b/src/miniublk.c
> > [...]
> > > @@ -624,9 +624,9 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
> > >               return 0;
> > >
> > >       if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
> > > -             cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
> > > -     else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
> > > -             cmd_op = UBLK_IO_FETCH_REQ;
> > > +             cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
> > > +     else
> > > +             cmd_op = UBLK_U_IO_FETCH_REQ;
> >
> > The hunk above changes the "else if" part, is this intentional?
> >
> 
> Yes, this is intentional because we already check things in
>     if (!(io->flags &
>         (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP)))
> which returns early if neither flag is set, so checking the first
> condition makes another check redundant as by that
> time we know we need UBLK_U_IO_FETCH_REQ.

Thanks for the explanation. Now I see your point.

> 
> However if you think it's safer to still check if io->flags &
> UBLKSRV_NEED_FETCH_RQ, I can implement it this way in the v2.
> Let me know what you prefer.

I think it's better to keep the current

  "else if (io->flags & UBLKSRV_NEED_FETCH_RQ)"

form. Even though the change is small and will not affect the code behavior, it
goes against the "single purpose with single patch" guide. Anyone who looks at
the commit in the future may have the same question as mine.

^ permalink raw reply

* Re: [PATCH blktests] Fix _get_page_size()
From: Shin'ichiro Kawasaki @ 2026-06-22 22:27 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Bart Van Assche, Jeff Moyer, linux-block, kch
In-Reply-To: <ajlxXfgpMQJ4qlRR@telecaster>

On Jun 22, 2026 / 10:31, Omar Sandoval wrote:
> On Mon, Jun 22, 2026 at 08:38:48PM +0900, Shin'ichiro Kawasaki wrote:
> > On Jun 20, 2026 / 09:11, Bart Van Assche wrote:
> > > On 6/20/26 6:51 AM, Shin'ichiro Kawasaki wrote:
> > > > On Jun 20, 2026 / 05:55, Bart Van Assche wrote:
> > > > > On 6/20/26 3:26 AM, Shin'ichiro Kawasaki wrote:
> > > > > > This is a rather fundamental change, so I would like to ask opinions from
> > > > > > other blktests users, especially Omar and Chaitanya. What do you think about
> > > > > > the idea to add getconf to the requirement list?
> > > > > 
> > > > > CONFIG_PAGE_SHIFT was introduced in the Linux kernel in February 2024
> > > > > (commit ba89f9c8ccba ("arch: consolidate existing CONFIG_PAGE_SIZE_*KB
> > > > > definitions")). Older kernels had CONFIG_PAGE_SIZE_4KB,
> > > > > CONFIG_PAGE_SIZE_16KB, etc. This means that it is possible to derive the
> > > > > kernel page size from the kernel configuration file for all upstream and
> > > > > distro kernels, isn't it?
> > > > 
> > > > I checked the commit is in the tag v6.9. My Debian bookworm system has kernel
> > > > v6.1, then the config file at /boot does not have CONFIG_PAGE_SHIFT as expected.
> > > > But it does not have CONFIG_PAGE_SIZE_* either... I'm still afraid that kernel
> > > > config file approach is not reliable.
> > > 
> > > Right, for older kernels CONFIG_PAGE_SIZE_*KB is only available for some
> > > but not for all supported architectures.
> > > 
> > > It is not clear to me where the desire to avoid the dependency on
> > > getconf comes from? As far as I know it is available on all Linux
> > > distro's. Since it is typically included in the C library package it
> > > should not introduce a new dependency.
> > 
> > I think less dependent is the better in general, and wanted to confirm that
> > it is fine for everybody. If there is no voice to object, I will create a
> > patch to add getconf to the requirement list.
> 
> I agree with Bart, getconf is ubiquitous enough that it's not worth
> trying to hack around its absence. In my opinion, parsing kernel config
> options should be a last resort. If anyone complains about the getconf
> dependency in the future, I think it'd be better to add a simple
> src/pagesize.c file that uses sysconf(_SC_PAGESIZE), but I don't expect
> that to be necessary.

Omar, thank you for the comment. It's good to have the plan B idea of
"src/pagesize.c". I will prepare the patch to add getconf to the
requirement list as the plan A.

^ permalink raw reply

* [PATCH net v3 0/2] vsock/virtio: fix msg_iter desync on transmission failure
From: Octavian Purdila @ 2026-06-22 22:27 UTC (permalink / raw)
  To: netdev
  Cc: Alexander Viro, Andrew Morton, Arseniy Krasnov, David S. Miller,
	Eric Dumazet, Eugenio Pérez, Jakub Kicinski, Jason Wang, kvm,
	linux-block, linux-fsdevel, linux-kernel, Michael S. Tsirkin,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Stefano Garzarella,
	virtualization, Xuan Zhuo, Jens Axboe, Octavian Purdila

This series fixes a msg_iter desync issue in the virtio vsock transport
that can lead to warnings and eventual -ENOMEM under specific failure
scenarios (e.g. partial GUP failure during MSG_ZEROCOPY transmission).

To fix this, we need to restore the msg_iter state on transmission failure.
However, since virtio vsock transport can be built as a module, we first
need to export iov_iter_restore.

Patch 1 exports iov_iter_restore.
Patch 2 implements the msg_iter restoration in virtio vsock.

Changes in v3:
- Use EXPORT_SYMBOL_GPL (Jens)

Changes in v2:
- Use iov_iter_savestate()/iov_iter_restore() (Stefano)
- Use a single restore point (Stefano)
- Reverse xmas tree (Stefano)
- Added comments in the code (Stefano)

v2: https://lore.kernel.org/all/20260613000953.467473-1-tavip@google.com/
v1: https://lore.kernel.org/all/20260609004809.1285028-1-tavip@google.com/

Octavian Purdila (2):
  iov_iter: export iov_iter_restore
  vsock/virtio: restore msg_iter on transmission failure

 lib/iov_iter.c                          |  1 +
 net/vmw_vsock/virtio_transport_common.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)

-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply

* [PATCH net v3 1/2] iov_iter: export iov_iter_restore
From: Octavian Purdila @ 2026-06-22 22:27 UTC (permalink / raw)
  To: netdev
  Cc: Alexander Viro, Andrew Morton, Arseniy Krasnov, David S. Miller,
	Eric Dumazet, Eugenio Pérez, Jakub Kicinski, Jason Wang, kvm,
	linux-block, linux-fsdevel, linux-kernel, Michael S. Tsirkin,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Stefano Garzarella,
	virtualization, Xuan Zhuo, Jens Axboe, Octavian Purdila
In-Reply-To: <20260622222757.2130402-1-tavip@google.com>

Export iov_iter_restore so that it can be used by modules.

This is needed by the virtio vsock transport (which can be built as a
module) to restore the msg_iter state when transmission fails.

Acked-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Octavian Purdila <tavip@google.com>
---
 lib/iov_iter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 273919b161617..f5df63961fb24 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1491,6 +1491,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 		i->__iov -= state->nr_segs - i->nr_segs;
 	i->nr_segs = state->nr_segs;
 }
+EXPORT_SYMBOL_GPL(iov_iter_restore);
 
 /*
  * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH net v3 2/2] vsock/virtio: restore msg_iter on transmission failure
From: Octavian Purdila @ 2026-06-22 22:27 UTC (permalink / raw)
  To: netdev
  Cc: Alexander Viro, Andrew Morton, Arseniy Krasnov, David S. Miller,
	Eric Dumazet, Eugenio Pérez, Jakub Kicinski, Jason Wang, kvm,
	linux-block, linux-fsdevel, linux-kernel, Michael S. Tsirkin,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Stefano Garzarella,
	virtualization, Xuan Zhuo, Jens Axboe, Octavian Purdila,
	syzbot+28e5f3d207b14bae122a
In-Reply-To: <20260622222757.2130402-1-tavip@google.com>

When transmission fails in virtio_transport_send_pkt_info, the msg_iter
might have been partially advanced. If we don't restore it, the next
attempt to send data will use an incorrect iterator state, leading to
desync and warnings like "send_pkt() returns 0, but X expected".

Specifically, this can happen in the following scenario, triggered by
the syzkaller repro:

1. A write-only VMA (PROT_WRITE only) is partially populated by a
   prior TUN write (which failed with -EIO but still faulted in some
   pages).
2. A vsock sendmmsg call with MSG_ZEROCOPY requests transmission of a
   buffer from this VMA.
3. The first packet (64KB) is sent successfully because the pages are
   populated.
4. The second packet allocation fails because GUP fast pins the first page
   but GUP slow fails on the next unpopulated page due to PROT_WRITE-only
   permissions.
5. The iterator is advanced by the partially successful GUP (68KB total
   advanced: 64KB from first packet + 4KB from second), but the send loop
   breaks and only reports 64KB sent. This creates a 4KB desync.
6. The next retry starts with a non-zero iov_offset, disabling zerocopy
   and falling back to copy mode.
7. In copy mode, the transmission succeeds for the next packets but
   exhausts the iterator early because of the desync.
8. The final retry sees an empty iterator but zerocopy is re-enabled
   (offset resets). It attempts to send the remaining bytes with zerocopy
   but pins 0 pages, creating an empty packet.
9. The transport sends the empty packet, triggering the warning because
   the returned bytes (header only) do not match the expected payload size.
10. The loop continues to spin, allocating ubuf_info each time, eventually
    exhausting sysctl_optmem_max and returning -ENOMEM to userspace.

Restore msg_iter to its original state before the packet allocation
and transmission attempt if they fail.

Fixes: e0718bd82e27 ("vsock: enable setting SO_ZEROCOPY")
Reported-by: syzbot+28e5f3d207b14bae122a@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=28e5f3d207b14bae122a
Assisted-by: gemini:gemini-3.1-pro
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Octavian Purdila <tavip@google.com>
---
 net/vmw_vsock/virtio_transport_common.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 09475007165b3..35fd4094d771d 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -295,6 +295,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
 	u32 src_cid, src_port, dst_cid, dst_port;
 	const struct virtio_transport *t_ops;
+	struct iov_iter_state msg_iter_state;
 	struct virtio_vsock_sock *vvs;
 	struct ubuf_info *uarg = NULL;
 	u32 pkt_len = info->pkt_len;
@@ -368,8 +369,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 		struct sk_buff *skb;
 		size_t skb_len;
 
+		/* Save iterator state in case allocation or transmission fails
+		 * so we can restore it and retry.
+		 */
+		if (info->msg)
+			iov_iter_save_state(&info->msg->msg_iter, &msg_iter_state);
+
 		skb_len = min(max_skb_len, rest_len);
 
+		/* Note: virtio_transport_alloc_skb() can advance info->msg->msg_iter
+		 * even if it fails (e.g. partial GUP success).
+		 */
 		skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
 						 uarg,
 						 src_cid, src_port,
@@ -399,6 +409,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 			break;
 	} while (rest_len);
 
+	if (info->msg && ret < 0)
+		iov_iter_restore(&info->msg->msg_iter, &msg_iter_state);
+
 	virtio_transport_put_credit(vvs, rest_len);
 
 	/* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1.
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* Re: [PATCH] nbd: don't warn when reclassifying a busy socket lock
From: Hillf Danton @ 2026-06-23  0:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Deepanshu Kartikey, linux-block, nbd, linux-kernel,
	syzbot+6b85d1e39a5b8ed9a954
In-Reply-To: <CANn89iJBomCNpwzOiYHmmPf0i3KQGaqoiKh6VFeM6NHOQaCn3Q@mail.gmail.com>

On Mon, 22 Jun 2026 01:18:10 -0700 Eric Dumazet wrote:
>On Sun, Jun 21, 2026 at 6:43 PM Hillf Danton <hdanton@sina.com> wrote:
>> On Mon, 22 Jun 2026 05:22:55 +0530 Deepanshu Kartikey wrote:
>> > nbd_reclassify_socket() warns via WARN_ON_ONCE() if the socket lock is
>> > held at the point of reclassification. That assertion was copied from
>> > nvme-tcp, where the socket is created internally by the kernel
>> > (sock_create_kern()) and is never visible to user space, so the lock
>> > is guaranteed to be free.
>> >
>> > NBD is different: the socket is looked up from a user-supplied fd in
>> > nbd_get_socket(), and user space retains that fd. A concurrent syscall
>> > on the same socket (or softirq processing taking bh_lock_sock() on a
>> > connected TCP socket) can legitimately hold the lock at the instant
>> > NBD reclassifies it. sock_allow_reclassification() then returns false
>> > and the WARN_ON_ONCE() fires, which turns into a crash under
>> > panic_on_warn. This is reachable by simply racing NBD_CMD_CONNECT
>> > against socket activity on the same fd, as reported by syzbot.
>> >
>> Given the syzbot report, if you are right (I suspect) then Eric delivered
>> another half-baked croissant, and feel free to cut it off instead to make
>> room for correct fix.
>
> Nobody (including you) caught this difference between nbd and other
> sock_allow_reclassification() callers.
> 
Nope, actually it raises the question -- does the deadlock still remain
after your fix without the lock key you added applied?

> What was the "correct fix" you envisioned exactly?
>
Frankly I had no evidence against your fix a couple days back, but now I
see your lock key approach fails to take off. And the correct fix is to
erase the incorrect locking order ffa1e7ada456 tries to catch, more
difficult than you thought so far.

^ permalink raw reply

* Re: [PATCH] nbd: don't warn when reclassifying a busy socket lock
From: Eric Dumazet @ 2026-06-23  0:21 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Deepanshu Kartikey, linux-block, nbd, linux-kernel,
	syzbot+6b85d1e39a5b8ed9a954
In-Reply-To: <20260623000723.135-1-hdanton@sina.com>

On Mon, Jun 22, 2026 at 5:07 PM Hillf Danton <hdanton@sina.com> wrote:
>
> On Mon, 22 Jun 2026 01:18:10 -0700 Eric Dumazet wrote:
> >On Sun, Jun 21, 2026 at 6:43 PM Hillf Danton <hdanton@sina.com> wrote:
> >> On Mon, 22 Jun 2026 05:22:55 +0530 Deepanshu Kartikey wrote:
> >> > nbd_reclassify_socket() warns via WARN_ON_ONCE() if the socket lock is
> >> > held at the point of reclassification. That assertion was copied from
> >> > nvme-tcp, where the socket is created internally by the kernel
> >> > (sock_create_kern()) and is never visible to user space, so the lock
> >> > is guaranteed to be free.
> >> >
> >> > NBD is different: the socket is looked up from a user-supplied fd in
> >> > nbd_get_socket(), and user space retains that fd. A concurrent syscall
> >> > on the same socket (or softirq processing taking bh_lock_sock() on a
> >> > connected TCP socket) can legitimately hold the lock at the instant
> >> > NBD reclassifies it. sock_allow_reclassification() then returns false
> >> > and the WARN_ON_ONCE() fires, which turns into a crash under
> >> > panic_on_warn. This is reachable by simply racing NBD_CMD_CONNECT
> >> > against socket activity on the same fd, as reported by syzbot.
> >> >
> >> Given the syzbot report, if you are right (I suspect) then Eric delivered
> >> another half-baked croissant, and feel free to cut it off instead to make
> >> room for correct fix.
> >
> > Nobody (including you) caught this difference between nbd and other
> > sock_allow_reclassification() callers.
> >
> Nope, actually it raises the question -- does the deadlock still remain
> after your fix without the lock key you added applied?

LOCKDEP might have a false positive, but it will be much much harder to trigger.

I had about 50 syzbot duplicates (that I did not release) before d532cddb6c60
 ("nbd: Reclassify sockets to avoid lockdep circular dependency").

>
> > What was the "correct fix" you envisioned exactly?
> >
> Frankly I had no evidence against your fix a couple days back, but now I
> see your lock key approach fails to take off. And the correct fix is to
> erase the incorrect locking order ffa1e7ada456 tries to catch, more
> difficult than you thought so far.

Which incorrect locking order are you referring to? This is a LOCKDEP
false positive.

I suggest you send a patch so we can discuss it.

^ permalink raw reply

* Re: [PATCH] nbd: don't warn when reclassifying a busy socket lock
From: Hillf Danton @ 2026-06-23  0:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Deepanshu Kartikey, linux-block, nbd, linux-kernel,
	syzbot+6b85d1e39a5b8ed9a954
In-Reply-To: <CANn89iLPWCo_u-8jCsDM6jjZYfESvtUt9n3xD7yuAyNNntSw6w@mail.gmail.com>

On Mon, 22 Jun 2026 17:21:53 -0700 Eric Dumazet wrote:
>On Mon, Jun 22, 2026 at 5:07 PM Hillf Danton <hdanton@sina.com> wrote:
>> On Mon, 22 Jun 2026 01:18:10 -0700 Eric Dumazet wrote:
>> >On Sun, Jun 21, 2026 at 6:43 PM Hillf Danton <hdanton@sina.com> wrote:
>> >> On Mon, 22 Jun 2026 05:22:55 +0530 Deepanshu Kartikey wrote:
>> >> > nbd_reclassify_socket() warns via WARN_ON_ONCE() if the socket lock is
>> >> > held at the point of reclassification. That assertion was copied from
>> >> > nvme-tcp, where the socket is created internally by the kernel
>> >> > (sock_create_kern()) and is never visible to user space, so the lock
>> >> > is guaranteed to be free.
>> >> >
>> >> > NBD is different: the socket is looked up from a user-supplied fd in
>> >> > nbd_get_socket(), and user space retains that fd. A concurrent syscall
>> >> > on the same socket (or softirq processing taking bh_lock_sock() on a
>> >> > connected TCP socket) can legitimately hold the lock at the instant
>> >> > NBD reclassifies it. sock_allow_reclassification() then returns false
>> >> > and the WARN_ON_ONCE() fires, which turns into a crash under
>> >> > panic_on_warn. This is reachable by simply racing NBD_CMD_CONNECT
>> >> > against socket activity on the same fd, as reported by syzbot.
>> >> >
>> >> Given the syzbot report, if you are right (I suspect) then Eric delivered
>> >> another half-baked croissant, and feel free to cut it off instead to make
>> >> room for correct fix.
>> >
>> > Nobody (including you) caught this difference between nbd and other
>> > sock_allow_reclassification() callers.
>> >
>> Nope, actually it raises the question -- does the deadlock still remain
>> after your fix without the lock key you added applied?
>
>LOCKDEP might have a false positive, but it will be much much harder to trigger.
>
>I had about 50 syzbot duplicates (that I did not release) before d532cddb6c60
> ("nbd: Reclassify sockets to avoid lockdep circular dependency").
>
>>
>> > What was the "correct fix" you envisioned exactly?
>> >
>> Frankly I had no evidence against your fix a couple days back, but now I
>> see your lock key approach fails to take off. And the correct fix is to
>> erase the incorrect locking order ffa1e7ada456 tries to catch, more
>> difficult than you thought so far.
>
>Which incorrect locking order are you referring to? This is a LOCKDEP
>false positive.
>
In addition to 50 syzbot reports, your fix has a Fixes tag, no?

>I suggest you send a patch so we can discuss it.

The deadlock existed before ffa1e7ada456, why is a chance left for your fix?

^ permalink raw reply

* Re: [PATCH 1/2] blk-cgroup: fix blkg leak in blkg_create() error path
From: Tao Cui @ 2026-06-23  1:16 UTC (permalink / raw)
  To: Zizhi Wo, axboe, tj, josef, linux-block
  Cc: cui.tao, cgroups, yangerkun, chengzhihao1, houtao1, yukuai
In-Reply-To: <20260622070714.1158886-2-wozizhi@huaweicloud.com>

Hi Zizhi,

Thanks for the patch.  I ran into the same issue and posted a fix for it
earlier:

  https://lore.kernel.org/all/20260507061229.57466-1-cuitao@kylinos.cn/

The leak fix is identical to yours (blkg_put() -> percpu_ref_kill()),
plus one extra change: moving blkg->online = true into the success
block:

	if (likely(!ret)) {
		...
+		blkg->online = true;
	}
-	blkg->online = true;

On the failure path the blkg was never inserted into any list, and its
blkg->pd[i]->online flags were not set either (those are in the same
block).  Leaving blkg->online = true marks a blkg as online that was
never created -- inconsistent with pd[]->online and with
blkg_destroy(), which clears blkg->online = false.  Not observable
today, since the failed blkg is on no list and unreachable by the
online readers, but the flag should track the actual insertion.

(This was sent to the cgroups list rather than linux-block, hence the
overlap.)

Thanks,
Tao

在 2026/6/22 15:07, Zizhi Wo 写道:
> When radix_tree_insert() fails in blkg_create(), the error path calls
> blkg_put() to release the blkg. This was correct when blkg->refcnt was an
> atomic_t: blkg_put() dropped it to 0 and triggered the release path.
> 
> But commit 7fcf2b033b84 ("blkcg: change blkg reference counting to use
> percpu_ref") switched refcnt to a percpu_ref. In percpu mode
> percpu_ref_put() never checks for zero, so the release callback is never
> invoked. This blkg is on neither blkcg->blkg_list nor queue->blkg_list, so
> blkg_destroy_all() / blkcg_destroy_blkgs() can never reach it to call
> blkg_destroy()->percpu_ref_kill() either, causing the leak.
> 
> Fix it by killing the percpu_ref instead, which switches it to atomic mode
> and drops the initial ref.
> 
> Fixes: 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref")
> Signed-off-by: Zizhi Wo <wozizhi@huaweicloud.com>
> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
> ---
>  block/blk-cgroup.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index bc63bd220865..6386fe413994 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -437,11 +437,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>  
>  	if (!ret)
>  		return blkg;
>  
>  	/* @blkg failed fully initialized, use the usual release path */
> -	blkg_put(blkg);
> +	percpu_ref_kill(&blkg->refcnt);
>  	return ERR_PTR(ret);
>  
>  err_put_css:
>  	css_put(&blkcg->css);
>  err_free_blkg:


^ permalink raw reply

* [PATCH v2] block: serialize elevator changes for the same queue using a writer lock
From: Shin'ichiro Kawasaki @ 2026-06-23  1:32 UTC (permalink / raw)
  To: linux-block, Jens Axboe; +Cc: Ming Lei, Nilay Shroff, Shin'ichiro Kawasaki

When elevator_change() is called concurrently for the same queue, the
elevator_change_done() function runs concurrently as well. This function
adds or deletes kobjects for the debugfs entry of the queue. Then the
concurrent calls cause memory corruption of the kobjects and result in a
process hang. The core part of the elevator switch is protected by queue
freeze and q->elevator_lock. However, since the commit 559dc11143eb
("block: move elv_register[unregister]_queue out of elevator_lock"), the
elevator_change_done() is not serialized. Hence the memory corruption
and the hang.

The failures are observed when udev-worker writes to a sysfs
queue/scheduler attribute file while the blktests test case block/005
writes to the same attribute file. The failure also can be recreated by
running two processes that write to the same queue/scheduler file
concurrently. The failure is observed since another commit 370ac285f23a
("block: avoid cpu_hotplug_lock depedency on freeze_lock"). This commit
changed the behavior of queue freeze and it unveiled the failure.

Fix the failure by changing elv_iosched_store() to acquire
update_nr_hwq_lock as the writer lock instead of the reader lock. This
serializes all of the elevator switch steps, including the
elevator_change_done() call.

Fixes: 559dc11143eb ("block: move elv_register[unregister]_queue out of elevator_lock")
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
---
I observed that the blktests test case block/005 hung on a specific
server hardware using a specific HDD as a block device. During the test
case run, the kernel reported KASAN null-ptr-deref and slab-use-after-
free errors. The failure happened when a sysfs queue/scheduler attribute
file is written concurrently. I reported the failure and shared a
candidate fix patch as RFC [1]. Based on the comments and discussion on
the RFC patch, I propose this v2 patch that avoids introducing a new
lock. My thanks go to Ming and Nilay for the discussion.

Please refer to [1] for details of the failure. Also, I created a
blktests test case that recreates the hang [2], which I used to test the
fix.

* Changes from RFC v1
- Instead of adding a new mutex to struct request_queue, replace the
  reader lock on update_nr_hwq_lock with the writer lock in
  elv_iosched_store().

[1] https://lore.kernel.org/linux-block/20260611074200.474676-1-shinichiro.kawasaki@wdc.com/
[2] https://github.com/kawasaki/blktests/commit/8e80b3ccc0bbbe3f209d00eacd138d020de97fc6

 block/elevator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 3bcd37c2aa34..b03185a217ff 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -813,7 +813,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 	 *   update_nr_hwq_lock -> kn->active (via del_gendisk -> kobject_del)
 	 *   kn->active -> update_nr_hwq_lock (via this sysfs write path)
 	 */
-	if (!down_read_trylock(&set->update_nr_hwq_lock)) {
+	if (!down_write_trylock(&set->update_nr_hwq_lock)) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -824,7 +824,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 	} else {
 		ret = -ENOENT;
 	}
-	up_read(&set->update_nr_hwq_lock);
+	up_write(&set->update_nr_hwq_lock);
 
 out:
 	if (ctx.type)
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH] block/cgroup: Drop stale -EBUSY retry from blkg_conf_prep()
From: Tao Cui @ 2026-06-23  1:33 UTC (permalink / raw)
  To: Yang Xiuwei, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: cui.tao, cgroups, linux-block
In-Reply-To: <20260622085623.520209-1-yangxiuwei@kylinos.cn>



在 2026/6/22 16:56, Yang Xiuwei 写道:
> Since commit 8f4236d9008b ("block: remove QUEUE_FLAG_BYPASS and
> ->bypass") nothing in the blkcg blkg lookup/creation path
> returns -EBUSY anymore...

Correct. I traced every error path in blkg_conf_prep() (and blkg_create()
underneath it): the only possible values are -EINVAL, -EOPNOTSUPP, -ENOMEM,
-ENODEV and -EEXIST (from radix_tree_insert). The -EBUSY source was indeed
the blk_queue_bypass() check removed by 8f4236d9008b, so the retry branch
has been dead since 2018. Clean removal with no behavioral change.

Reviewed-by: Tao Cui <cuitao@kylinos.cn>

^ permalink raw reply

* Re: [PATCH 1/2] blk-cgroup: fix blkg leak in blkg_create() error path
From: Zizhi Wo @ 2026-06-23  1:38 UTC (permalink / raw)
  To: Tao Cui, axboe, tj, josef, linux-block
  Cc: cgroups, yangerkun, chengzhihao1, houtao1, yukuai
In-Reply-To: <38704548-786f-4ec7-afd4-228aa8d68ad7@linux.dev>



在 2026/6/23 9:16, Tao Cui 写道:
> Hi Zizhi,
> 
> Thanks for the patch.  I ran into the same issue and posted a fix for it
> earlier:
> 
>    https://lore.kernel.org/all/20260507061229.57466-1-cuitao@kylinos.cn/
> 
> The leak fix is identical to yours (blkg_put() -> percpu_ref_kill()),
> plus one extra change: moving blkg->online = true into the success
> block:
> 
> 	if (likely(!ret)) {
> 		...
> +		blkg->online = true;
> 	}
> -	blkg->online = true;
> 
> On the failure path the blkg was never inserted into any list, and its
> blkg->pd[i]->online flags were not set either (those are in the same
> block).  Leaving blkg->online = true marks a blkg as online that was
> never created -- inconsistent with pd[]->online and with
> blkg_destroy(), which clears blkg->online = false.  Not observable
> today, since the failed blkg is on no list and unreachable by the
> online readers, but the flag should track the actual insertion.
> 
> (This was sent to the cgroups list rather than linux-block, hence the
> overlap.)
> 
> Thanks,
> Tao

I'm not subscribed to the cgroup mailing list, so I didn't see that this
issue had already been fixed. :( And indeed, your patch nicely updates
blkg->online as well -- I hadn't realized that.

Thanks for the heads-up!

Thanks,
Zizhi Wo

> 
> 在 2026/6/22 15:07, Zizhi Wo 写道:
>> When radix_tree_insert() fails in blkg_create(), the error path calls
>> blkg_put() to release the blkg. This was correct when blkg->refcnt was an
>> atomic_t: blkg_put() dropped it to 0 and triggered the release path.
>>
>> But commit 7fcf2b033b84 ("blkcg: change blkg reference counting to use
>> percpu_ref") switched refcnt to a percpu_ref. In percpu mode
>> percpu_ref_put() never checks for zero, so the release callback is never
>> invoked. This blkg is on neither blkcg->blkg_list nor queue->blkg_list, so
>> blkg_destroy_all() / blkcg_destroy_blkgs() can never reach it to call
>> blkg_destroy()->percpu_ref_kill() either, causing the leak.
>>
>> Fix it by killing the percpu_ref instead, which switches it to atomic mode
>> and drops the initial ref.
>>
>> Fixes: 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref")
>> Signed-off-by: Zizhi Wo <wozizhi@huaweicloud.com>
>> Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
>> ---
>>   block/blk-cgroup.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index bc63bd220865..6386fe413994 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -437,11 +437,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
>>   
>>   	if (!ret)
>>   		return blkg;
>>   
>>   	/* @blkg failed fully initialized, use the usual release path */
>> -	blkg_put(blkg);
>> +	percpu_ref_kill(&blkg->refcnt);
>>   	return ERR_PTR(ret);
>>   
>>   err_put_css:
>>   	css_put(&blkcg->css);
>>   err_free_blkg:


^ permalink raw reply

* [PATCH blktests v2 1/2] src/miniublk: switch to ioctl-encoded ublk commands
From: Sebastian Chlad @ 2026-06-23  3:27 UTC (permalink / raw)
  To: linux-block; +Cc: shinichiro.kawasaki, Sebastian Chlad

Kernels built without CONFIG_BLKDEV_UBLK_LEGACY_OPCODES reject the
legacy raw UBLK_CMD_* and UBLK_IO_* opcodes. Switch miniublk to use
the ioctl-encoded UBLK_U_CMD_* and UBLK_U_IO_* variants defined in
linux/ublk_cmd.h instead.

For IO commands, the ioctl-encoded opcode is used for submission while
_IOC_NR() extracts the raw NR bits for build_user_data(), keeping the
user_data tag encoding intact.

Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
Co-authored-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
---
changes in v2:
1. Makefile as prepared by Shin'ichiro Kawasaki
2. restored else if to check if (io->flags & UBLKSRV_NEED_FETCH_RQ)

 src/Makefile   |  9 +++++++++
 src/miniublk.c | 28 ++++++++++++++--------------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index d8833bf..adfe3ef 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -8,6 +8,10 @@ HAVE_C_MACRO = $(shell if echo "$(H)include <$(1)>" |	\
 		$(CC) $(CFLAGS) -E - 2>&1 /dev/null | grep $(2) > /dev/null 2>&1; \
 		then echo 1;else echo 0; fi)
 
+HAVE_C_DEF = $(shell if echo -e "$(H)include <$(1)>\n#ifdef $(2)\nHAVE_$(2)\n#endif" | \
+		$(CC) $(CFLAGS) -E - 2>&1 /dev/null | grep HAVE_$(2) > /dev/null 2>&1; \
+		then echo 1;else echo 0; fi)
+
 C_TARGETS := \
 	dio-offsets \
 	loblksize \
@@ -27,6 +31,7 @@ C_UBLK_TARGETS := miniublk
 
 HAVE_LIBURING := $(call HAVE_C_MACRO,liburing.h,IORING_OP_URING_CMD)
 HAVE_UBLK_HEADER := $(call HAVE_C_HEADER,linux/ublk_cmd.h,1)
+HAVE_NEW_UBLK_INTF := $(call HAVE_C_DEF,linux/ublk_cmd.h,UBLK_U_CMD_START_DEV)
 
 CXX_TARGETS := \
 	discontiguous-io
@@ -37,8 +42,12 @@ SYZKALLER_TARGETS := \
 TARGETS := $(C_TARGETS) $(CXX_TARGETS) $(SYZKALLER_TARGETS)
 
 ifeq ($(HAVE_UBLK_HEADER), 1)
+ifeq ($(HAVE_NEW_UBLK_INTF), 1)
 C_URING_TARGETS += $(C_UBLK_TARGETS)
 else
+$(info Skip $(C_UBLK_TARGETS) build due to missing new ublk interface(v6.4+))
+endif
+else
 $(info Skip $(C_UBLK_TARGETS) build due to missing kernel header(v6.0+))
 endif
 
diff --git a/src/miniublk.c b/src/miniublk.c
index f98f850..628207a 100644
--- a/src/miniublk.c
+++ b/src/miniublk.c
@@ -294,7 +294,7 @@ static int __ublk_ctrl_cmd(struct ublk_dev *dev,
 int ublk_ctrl_stop_dev(struct ublk_dev *dev)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_STOP_DEV,
+		.cmd_op	= UBLK_U_CMD_STOP_DEV,
 	};
 
 	return __ublk_ctrl_cmd(dev, &data);
@@ -304,7 +304,7 @@ int ublk_ctrl_start_dev(struct ublk_dev *dev,
 		int daemon_pid)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_START_DEV,
+		.cmd_op	= UBLK_U_CMD_START_DEV,
 		.flags	= CTRL_CMD_HAS_DATA,
 	};
 
@@ -316,7 +316,7 @@ int ublk_ctrl_start_dev(struct ublk_dev *dev,
 int ublk_ctrl_add_dev(struct ublk_dev *dev)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_ADD_DEV,
+		.cmd_op	= UBLK_U_CMD_ADD_DEV,
 		.flags	= CTRL_CMD_HAS_BUF,
 		.addr = (__u64)&dev->dev_info,
 		.len = sizeof(struct ublksrv_ctrl_dev_info),
@@ -328,7 +328,7 @@ int ublk_ctrl_add_dev(struct ublk_dev *dev)
 int ublk_ctrl_del_dev(struct ublk_dev *dev)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op = UBLK_CMD_DEL_DEV,
+		.cmd_op = UBLK_U_CMD_DEL_DEV,
 		.flags = 0,
 	};
 
@@ -338,7 +338,7 @@ int ublk_ctrl_del_dev(struct ublk_dev *dev)
 int ublk_ctrl_get_info(struct ublk_dev *dev)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_GET_DEV_INFO,
+		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
 		.flags	= CTRL_CMD_HAS_BUF,
 		.addr = (__u64)&dev->dev_info,
 		.len = sizeof(struct ublksrv_ctrl_dev_info),
@@ -351,7 +351,7 @@ int ublk_ctrl_set_params(struct ublk_dev *dev,
 		struct ublk_params *params)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_SET_PARAMS,
+		.cmd_op	= UBLK_U_CMD_SET_PARAMS,
 		.flags	= CTRL_CMD_HAS_BUF,
 		.addr = (__u64)params,
 		.len = sizeof(*params),
@@ -364,7 +364,7 @@ static int ublk_ctrl_get_params(struct ublk_dev *dev,
 		struct ublk_params *params)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_GET_PARAMS,
+		.cmd_op	= UBLK_U_CMD_GET_PARAMS,
 		.flags	= CTRL_CMD_HAS_BUF,
 		.addr = (__u64)params,
 		.len = sizeof(*params),
@@ -378,7 +378,7 @@ static int ublk_ctrl_get_params(struct ublk_dev *dev,
 static int ublk_ctrl_start_user_recover(struct ublk_dev *dev)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_START_USER_RECOVERY,
+		.cmd_op	= UBLK_U_CMD_START_USER_RECOVERY,
 		.flags	= 0,
 	};
 
@@ -389,7 +389,7 @@ static int ublk_ctrl_end_user_recover(struct ublk_dev *dev,
 		int daemon_pid)
 {
 	struct ublk_ctrl_cmd_data data = {
-		.cmd_op	= UBLK_CMD_END_USER_RECOVERY,
+		.cmd_op	= UBLK_U_CMD_END_USER_RECOVERY,
 		.flags	= CTRL_CMD_HAS_DATA,
 	};
 
@@ -624,9 +624,9 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
 		return 0;
 
 	if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
-		cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
+		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
 	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
-		cmd_op = UBLK_IO_FETCH_REQ;
+		cmd_op = UBLK_U_IO_FETCH_REQ;
 
 	sqe = io_uring_get_sqe(&q->ring);
 	if (!sqe) {
@@ -637,7 +637,7 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
 
 	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe);
 
-	if (cmd_op == UBLK_IO_COMMIT_AND_FETCH_REQ)
+	if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
 		cmd->result = io->result;
 
 	/* These fields should be written once, never change */
@@ -650,7 +650,7 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
 	cmd->addr	= (__u64)io->buf_addr;
 	cmd->q_id	= q->q_id;
 
-	user_data = build_user_data(tag, cmd_op, 0, 0);
+	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
 	io_uring_sqe_set_data64(sqe, user_data);
 
 	io->flags = 0;
@@ -658,7 +658,7 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
 	q->cmd_inflight += 1;
 
 	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
-			__func__, q->q_id, tag, cmd_op,
+			__func__, q->q_id, tag, _IOC_NR(cmd_op),
 			io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
 	return 1;
 }
-- 
2.51.0


^ permalink raw reply related

* [PATCH blktests v2 2/2] src/miniublk: fall back to legacy opcodes on older kernels
From: Sebastian Chlad @ 2026-06-23  3:27 UTC (permalink / raw)
  To: linux-block; +Cc: shinichiro.kawasaki, Sebastian Chlad
In-Reply-To: <20260623032707.14439-1-sebastian.chlad@suse.com>

Try ioctl-encoded ADD_DEV and GET_DEV_INFO first; if either fails,
retry with the legacy raw opcode. After a successful bootstrap
command, derive use_ioctl from UBLK_F_CMD_IOCTL_ENCODE in dev_info.flags
so all subsequent control and IO commands use the mode reported by the
kernel.

Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
---
 src/miniublk.c | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/src/miniublk.c b/src/miniublk.c
index 628207a..b0c308b 100644
--- a/src/miniublk.c
+++ b/src/miniublk.c
@@ -112,6 +112,7 @@ struct ublk_dev {
 	int fds[2];	/* fds[0] points to /dev/ublkcN */
 	int nr_fds;
 	int ctrl_fd;
+	bool use_ioctl;
 	struct io_uring ring;
 };
 
@@ -235,7 +236,7 @@ static inline int ublk_setup_ring(struct io_uring *r, int depth,
 
 static inline void ublk_ctrl_init_cmd(struct ublk_dev *dev,
 		struct io_uring_sqe *sqe,
-		struct ublk_ctrl_cmd_data *data)
+		struct ublk_ctrl_cmd_data *data, __u32 cmd_op)
 {
 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
 	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
@@ -255,25 +256,34 @@ static inline void ublk_ctrl_init_cmd(struct ublk_dev *dev,
 	cmd->dev_id = info->dev_id;
 	cmd->queue_id = -1;
 
-	ublk_set_sqe_cmd_op(sqe, data->cmd_op);
+	ublk_set_sqe_cmd_op(sqe, cmd_op);
 
 	io_uring_sqe_set_data(sqe, cmd);
 }
 
+static void ublk_update_ioctl_encoding(struct ublk_dev *dev)
+{
+	dev->use_ioctl = !!(dev->dev_info.flags & UBLK_F_CMD_IOCTL_ENCODE);
+}
+
 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
 		struct ublk_ctrl_cmd_data *data)
 {
 	struct io_uring_sqe *sqe;
 	struct io_uring_cqe *cqe;
+	__u32 cmd_op = data->cmd_op;
 	int ret = -EINVAL;
 
+	if (!dev->use_ioctl)
+		cmd_op = _IOC_NR(cmd_op);
+
 	sqe = io_uring_get_sqe(&dev->ring);
 	if (!sqe) {
 		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
 		return ret;
 	}
 
-	ublk_ctrl_init_cmd(dev, sqe, data);
+	ublk_ctrl_init_cmd(dev, sqe, data, cmd_op);
 
 	ret = io_uring_submit(&dev->ring);
 	if (ret < 0) {
@@ -321,8 +331,19 @@ int ublk_ctrl_add_dev(struct ublk_dev *dev)
 		.addr = (__u64)&dev->dev_info,
 		.len = sizeof(struct ublksrv_ctrl_dev_info),
 	};
+	int ret;
 
-	return __ublk_ctrl_cmd(dev, &data);
+	ret = __ublk_ctrl_cmd(dev, &data);
+	if (ret < 0) {
+		/* retry with legacy opcode on older kernels */
+		dev->use_ioctl = false;
+		ret = __ublk_ctrl_cmd(dev, &data);
+	}
+
+	if (ret >= 0)
+		ublk_update_ioctl_encoding(dev);
+
+	return ret;
 }
 
 int ublk_ctrl_del_dev(struct ublk_dev *dev)
@@ -343,8 +364,19 @@ int ublk_ctrl_get_info(struct ublk_dev *dev)
 		.addr = (__u64)&dev->dev_info,
 		.len = sizeof(struct ublksrv_ctrl_dev_info),
 	};
+	int ret;
 
-	return __ublk_ctrl_cmd(dev, &data);
+	ret = __ublk_ctrl_cmd(dev, &data);
+	if (ret < 0 && dev->use_ioctl) {
+		/* retry with legacy opcode on older kernels */
+		dev->use_ioctl = false;
+		ret = __ublk_ctrl_cmd(dev, &data);
+	}
+
+	if (ret >= 0)
+		ublk_update_ioctl_encoding(dev);
+
+	return ret;
 }
 
 int ublk_ctrl_set_params(struct ublk_dev *dev,
@@ -453,6 +485,8 @@ static struct ublk_dev *ublk_ctrl_init()
 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
 	int ret;
 
+	dev->use_ioctl = true; /* use ioctl opcodes by default */
+
 	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
 	if (dev->ctrl_fd < 0) {
 		ublk_err("control dev %s can't be opened: %m %d\n", CTRL_DEV, errno);
@@ -628,6 +662,9 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
 	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
 		cmd_op = UBLK_U_IO_FETCH_REQ;
 
+	if (!q->dev->use_ioctl)
+		cmd_op = _IOC_NR(cmd_op);
+
 	sqe = io_uring_get_sqe(&q->ring);
 	if (!sqe) {
 		ublk_err("%s: run out of sqe %d, tag %d\n",
-- 
2.51.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox