* [PATCH v3 3/5] loop: set dma_alignment from the backing file for direct I/O
2026-06-24 17:09 [PATCH v3 0/5] block: validate direct I/O memory alignment Keith Busch
2026-06-24 17:09 ` [PATCH v3 1/5] block: use blkdev_iov_iter_get_pages status for errors Keith Busch
2026-06-24 17:09 ` [PATCH v3 2/5] block: fix dio leak on metadata mapping error Keith Busch
@ 2026-06-24 17:09 ` Keith Busch
2026-06-24 17:09 ` [PATCH v3 4/5] zloop: set dma_alignment from the backing files " Keith Busch
2026-06-24 17:09 ` [PATCH v3 5/5] block: validate user space vectors during extraction Keith Busch
4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2026-06-24 17:09 UTC (permalink / raw)
To: linux-block, linux-fsdevel
Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Direct I/O user pages are forwarded to the backing file unchanged, so
the backing's DMA alignment requirement applies to them. Track the
backing file's dio_mem_align and advertise it as the loop device's
dma_alignment if it is larger than the default so we advertise proper
limits and misaligned I/O is rejected early instead of being dispatched
to the backend.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/block/loop.c | 46 ++++++++++++++++++++++++++++++++++++--------
1 file changed, 38 insertions(+), 8 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 310de0463beb1..5fe61d542f8b7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -54,6 +54,7 @@ struct loop_device {
struct file *lo_backing_file;
unsigned int lo_min_dio_size;
+ unsigned int lo_dio_mem_align;
struct block_device *lo_device;
gfp_t old_gfp_mask;
@@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo)
__func__, lo->lo_number, lo->lo_file_name, rc);
}
-static unsigned int loop_query_min_dio_size(struct loop_device *lo)
+static void loop_update_dio_alignment(struct loop_device *lo)
{
struct file *file = lo->lo_backing_file;
struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
struct kstat st;
/*
- * Use the minimal dio alignment of the file system if provided.
+ * Use the dio alignment of the file system if provided. The incoming
+ * request's bio_vec is forwarded to the backing file unchanged, so its
+ * required memory alignment becomes the device's dma_alignment when
+ * used for direct-io.
*/
if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
- (st.result_mask & STATX_DIOALIGN))
- return st.dio_offset_align;
+ (st.result_mask & STATX_DIOALIGN)) {
+ lo->lo_min_dio_size = st.dio_offset_align;
+ lo->lo_dio_mem_align = st.dio_mem_align - 1;
+ return;
+ }
/*
* In a perfect world this wouldn't be needed, but as of Linux 6.13 only
* a handful of file systems support the STATX_DIOALIGN flag.
*/
- if (sb_bdev)
- return bdev_logical_block_size(sb_bdev);
- return SECTOR_SIZE;
+ if (sb_bdev) {
+ lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev);
+ lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev);
+ return;
+ }
+
+ lo->lo_min_dio_size = SECTOR_SIZE;
+ lo->lo_dio_mem_align = SECTOR_SIZE - 1;
}
static inline int is_loop_device(struct file *file)
@@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
if (lo->lo_backing_file->f_flags & O_DIRECT)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
- lo->lo_min_dio_size = loop_query_min_dio_size(lo);
+ loop_update_dio_alignment(lo);
}
static int loop_check_backing_file(struct file *file)
@@ -940,6 +952,19 @@ static unsigned int loop_default_blocksize(struct loop_device *lo)
return SECTOR_SIZE;
}
+static void loop_set_dma_limit(struct loop_device *lo, struct queue_limits *lim)
+{
+ /*
+ * Direct I/O forwards the user pages to the backing file unchanged, so
+ * track the backing's DMA alignment requirement as the mode is toggled.
+ */
+ if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+ lim->dma_alignment = max_t(unsigned int, lo->lo_dio_mem_align,
+ SECTOR_SIZE - 1);
+ else
+ lim->dma_alignment = SECTOR_SIZE - 1;
+}
+
static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
unsigned int bsize)
{
@@ -961,6 +986,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
lim->logical_block_size = bsize;
lim->physical_block_size = bsize;
lim->io_min = bsize;
+ loop_set_dma_limit(lo, lim);
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
lim->features |= BLK_FEAT_WRITE_CACHE;
@@ -1416,6 +1442,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
{
bool use_dio = !!arg;
unsigned int memflags;
+ struct queue_limits lim;
if (lo->lo_state != Lo_bound)
return -ENXIO;
@@ -1434,6 +1461,9 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
else
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+ lim = queue_limits_start_update(lo->lo_queue);
+ loop_set_dma_limit(lo, &lim);
+ queue_limits_commit_update(lo->lo_queue, &lim);
blk_mq_unfreeze_queue(lo->lo_queue, memflags);
return 0;
}
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH v3 4/5] zloop: set dma_alignment from the backing files for direct I/O
2026-06-24 17:09 [PATCH v3 0/5] block: validate direct I/O memory alignment Keith Busch
` (2 preceding siblings ...)
2026-06-24 17:09 ` [PATCH v3 3/5] loop: set dma_alignment from the backing file for direct I/O Keith Busch
@ 2026-06-24 17:09 ` Keith Busch
2026-06-24 17:09 ` [PATCH v3 5/5] block: validate user space vectors during extraction Keith Busch
4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2026-06-24 17:09 UTC (permalink / raw)
To: linux-block, linux-fsdevel
Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Direct I/O user pages are forwarded to the backing files unchanged, so
the backing's DMA alignment requirement applies to them. Track the
backing file's dio_mem_align and advertise it as the zloop device's
dma_alignment if it is larger than the default so we advertise proper
limits and misaligned I/O is rejected early instead of being dispatched
to the backend.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/block/zloop.c | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 55eeb6aac0ea3..f97a20cfdb7ce 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -144,6 +144,7 @@ struct zloop_device {
unsigned int nr_conv_zones;
unsigned int max_open_zones;
unsigned int block_size;
+ unsigned int dio_mem_align;
spinlock_t open_zones_lock;
struct list_head open_zones_lru_list;
@@ -1037,20 +1038,30 @@ static int zloop_get_block_size(struct zloop_device *zlo,
struct kstat st;
/*
- * If the FS block size is lower than or equal to 4K, use that as the
- * device block size. Otherwise, fallback to the FS direct IO alignment
- * constraint if that is provided, and to the FS underlying device
- * physical block size if the direct IO alignment is unknown.
+ * Use the dio alignment of the file system if provided. The incoming
+ * request's bio_vec is forwarded to the backing file unchanged, so its
+ * required memory alignment becomes the device's dma_alignment when
+ * used for direct-io.
*/
- if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
- zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
- else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
- (st.result_mask & STATX_DIOALIGN))
+ if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN)) {
zlo->block_size = st.dio_offset_align;
- else if (sb_bdev)
+ zlo->dio_mem_align = st.dio_mem_align - 1;
+ } else if (sb_bdev) {
zlo->block_size = bdev_physical_block_size(sb_bdev);
- else
+ zlo->dio_mem_align = bdev_dma_alignment(sb_bdev);
+ } else {
zlo->block_size = SECTOR_SIZE;
+ zlo->dio_mem_align = SECTOR_SIZE - 1;
+ }
+
+ /*
+ * Prefer the FS block size for the device block size when it is no
+ * larger than 4K; otherwise keep the direct I/O / physical block size
+ * selected above.
+ */
+ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+ zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
pr_err("Zone capacity is not aligned to block size %u\n",
@@ -1279,6 +1290,10 @@ static int zloop_ctl_add(struct zloop_options *opts)
lim.physical_block_size = zlo->block_size;
lim.logical_block_size = zlo->block_size;
+ /* Direct I/O forwards the request pages to the backing files as-is. */
+ if (!opts->buffered_io)
+ lim.dma_alignment = max_t(unsigned int, zlo->dio_mem_align,
+ SECTOR_SIZE - 1);
if (zlo->zone_append)
lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
lim.max_open_zones = zlo->max_open_zones;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH v3 5/5] block: validate user space vectors during extraction
2026-06-24 17:09 [PATCH v3 0/5] block: validate direct I/O memory alignment Keith Busch
` (3 preceding siblings ...)
2026-06-24 17:09 ` [PATCH v3 4/5] zloop: set dma_alignment from the backing files " Keith Busch
@ 2026-06-24 17:09 ` Keith Busch
4 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2026-06-24 17:09 UTC (permalink / raw)
To: linux-block, linux-fsdevel
Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch, stable
From: Keith Busch <kbusch@kernel.org>
The bio-based drivers don't necessarily check the alignment split, and
stacking block drivers don't always handle a misalignment detected after
submitting the bio. Validate user vectors against the device's
dma_alignment as the bio is built from the iov_iter, rejecting
misaligned vectors early with -EINVAL.
Cc: stable@vger.kernel.org
Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 56 +++++++++++++++++++++++++++++++++++++++++---
block/blk-map.c | 2 +-
block/fops.c | 2 +-
fs/iomap/direct-io.c | 1 +
include/linux/bio.h | 2 +-
include/linux/uio.h | 10 +++++++-
lib/iov_iter.c | 9 ++++++-
7 files changed, 74 insertions(+), 8 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index f2a5f4d0a9672..faad41a72ac77 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1220,10 +1220,45 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
return 0;
}
+#ifdef CONFIG_DEBUG_KERNEL
+static inline bool bio_iov_bvec_aligned(const struct bio *bio,
+ unsigned mem_align_mask)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ /*
+ * Correct callers never break the alignment requirements, so this
+ * exhaustive check is only paid for in debug builds.
+ */
+ for_each_mp_bvec(bv, bio->bi_io_vec, iter, bio->bi_iter)
+ if ((bv.bv_offset | bv.bv_len) & mem_align_mask)
+ return false;
+ return true;
+}
+#else
+static inline bool bio_iov_bvec_aligned(const struct bio *bio,
+ unsigned mem_align_mask)
+{
+ /*
+ * We forward the bio_vec as-is, so ITER_BVEC callers must provide
+ * segments already aligned to the device's DMA alignment. The only
+ * unchecked user-controllable offset that reaches here is an io_uring
+ * registered buffer where just the first segment can be unaligned
+ * (the rest is virtually contiguous), so checking only that one is
+ * sufficient to know if the entire vector is valid.
+ */
+ return !(mp_bvec_iter_offset(bio->bi_io_vec, bio->bi_iter) &
+ mem_align_mask);
+}
+#endif
+
/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be added
+ * @mem_align_mask: the mask the source address and length must be aligned to,
+ * 0 for no requirement
* @len_align_mask: the mask to align the total size to, 0 for any length
*
* This takes either an iterator pointing to user memory, or one pointing to
@@ -1242,7 +1277,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
* is returned only if 0 pages could be pinned.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
- unsigned len_align_mask)
+ unsigned mem_align_mask, unsigned len_align_mask)
{
iov_iter_extraction_t flags = 0;
@@ -1251,6 +1286,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
if (iov_iter_is_bvec(iter)) {
bio_iov_bvec_set(bio, iter);
+
+ if (!bio_iov_bvec_aligned(bio, mem_align_mask))
+ return -EINVAL;
+
iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
@@ -1265,8 +1304,19 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
BIO_MAX_SIZE - bio->bi_iter.bi_size,
- &bio->bi_vcnt, bio->bi_max_vecs, flags);
+ &bio->bi_vcnt, bio->bi_max_vecs,
+ mem_align_mask, flags);
if (ret <= 0) {
+ /*
+ * A misaligned vector fails the whole I/O. Release any
+ * pages pinned by earlier iterations before returning
+ * since this bio won't be submitted to release them.
+ */
+ if (ret == -EINVAL) {
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_PAGE_PINNED);
+ bio->bi_vcnt = 0;
+ }
if (!bio->bi_vcnt)
return ret;
break;
@@ -1377,7 +1427,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
ssize_t ret;
ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
- &bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+ &bio->bi_vcnt, bio->bi_max_vecs - 1, 0, 0);
if (ret <= 0) {
if (!bio->bi_vcnt) {
folio_put(folio);
diff --git a/block/blk-map.c b/block/blk-map.c
index 768549f19f97e..c9535efe1a913 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -274,7 +274,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
* No alignment requirements on our part to support arbitrary
* passthrough commands.
*/
- ret = bio_iov_iter_get_pages(bio, iter, 0);
+ ret = bio_iov_iter_get_pages(bio, iter, 0, 0);
if (ret)
goto out_put;
ret = blk_rq_append_bio(rq, bio);
diff --git a/block/fops.c b/block/fops.c
index 0098a90a956e1..e519d7f43b310 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -46,7 +46,7 @@ static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
static inline int blkdev_iov_iter_get_pages(struct bio *bio,
struct iov_iter *iter, struct block_device *bdev)
{
- return bio_iov_iter_get_pages(bio, iter,
+ return bio_iov_iter_get_pages(bio, iter, bdev_dma_alignment(bdev),
bdev_logical_block_size(bdev) - 1);
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b485e3b191daf..ff458aa12ae29 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -358,6 +358,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
iomap_max_bio_size(&iter->iomap), alignment);
else
ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
+ bdev_dma_alignment(bio->bi_bdev),
alignment - 1);
if (unlikely(ret))
goto out_put_bio;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8f33f717b14f5..ce34ea49ef358 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -477,7 +477,7 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
size_t len, enum req_op op);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
- unsigned len_align_mask);
+ unsigned mem_align_mask, unsigned len_align_mask);
void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index a9bc5b3067e32..fe2e985d74d24 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -389,9 +389,17 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
size_t maxsize, unsigned int maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0);
+/*
+ * Block-layer consumers (e.g. bio_iov_iter_get_pages()) require that the
+ * segments of an ITER_BVEC iterator are already aligned to the target device's
+ * DMA alignment, and forward them as-is. In-kernel users that build their own
+ * bvecs must not create sub-aligned segments; iov_iter_extract_bvecs() enforces
+ * the same for the segments it extracts via @mem_align_mask.
+ */
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
size_t max_size, unsigned short *nr_vecs,
- unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
+ unsigned short max_vecs, unsigned mem_align_mask,
+ iov_iter_extraction_t extraction_flags);
/**
* iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 273919b161617..c343075951ded 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1886,6 +1886,8 @@ static unsigned int get_contig_folio_len(struct page **pages,
* @max_size: maximum size to extract from @iter
* @nr_vecs: number of vectors in @bv (on in and output)
* @max_vecs: maximum vectors in @bv, including those filled before calling
+ * @mem_align_mask: reject with -EINVAL if the source address or
+ * length is not aligned to this mask
* @extraction_flags: flags to qualify request
*
* Like iov_iter_extract_pages(), but returns physically contiguous ranges
@@ -1897,14 +1899,19 @@ static unsigned int get_contig_folio_len(struct page **pages,
*/
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
size_t max_size, unsigned short *nr_vecs,
- unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
+ unsigned short max_vecs, unsigned mem_align_mask,
+ iov_iter_extraction_t extraction_flags)
{
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
unsigned short entries_left = max_vecs - *nr_vecs;
unsigned short nr_pages, i = 0;
size_t left, offset, len;
struct page **pages;
ssize_t size;
+ if ((start | iter_iov_len(iter)) & mem_align_mask)
+ return -EINVAL;
+
/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 6+ messages in thread