Linux block layer
 help / color / mirror / Atom feed

* [PATCH 07/16] ext4: Make ext4_bio_write_folio() return void
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Since the fs-layer file contents encryption implementation was removed,
ext4_bio_write_folio() now always returns 0.  Change it to return void,
and likewise for its caller mpage_submit_folio().

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/inode.c   | 31 ++++++++-----------------------
 fs/ext4/page-io.c |  6 ++----
 3 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b37c136ea3ab..920a8ec1b948 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3943,11 +3943,11 @@ extern int ext4_put_io_end(ext4_io_end_t *io_end);
 extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
 extern void ext4_io_submit_init(struct ext4_io_submit *io,
 				struct writeback_control *wbc);
 extern void ext4_end_io_rsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
-int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
+void ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
 		size_t len);
 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
 
 /* mmp.c */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8eb2af481129..c6faa7c751ca 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2062,15 +2062,14 @@ static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
 	mpd->start_pos += folio_size(folio);
 	mpd->wbc->nr_to_write -= folio_nr_pages(folio);
 	folio_unlock(folio);
 }
 
-static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
+static void mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
 {
 	size_t len;
 	loff_t size;
-	int err;
 
 	WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
 	folio_clear_dirty_for_io(folio);
 	/*
 	 * We have to be very careful here!  Nothing protects writeback path
@@ -2088,13 +2087,11 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
 	size = i_size_read(mpd->inode);
 	len = folio_size(folio);
 	if (folio_pos(folio) + len > size &&
 	    !ext4_verity_in_progress(mpd->inode))
 		len = size & (len - 1);
-	err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
-
-	return err;
+	ext4_bio_write_folio(&mpd->io_submit, folio, len);
 }
 
 #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
 
 /*
@@ -2167,20 +2164,18 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
  * Walk through page buffers from @bh upto @head (exclusive) and either submit
  * the page for IO if all buffers in this page were mapped and there's no
  * accumulated extent of buffers to map or add buffers in the page to the
  * extent of buffers to map. The function returns 1 if the caller can continue
  * by processing the next page, 0 if it should stop adding buffers to the
- * extent to map because we cannot extend it anymore. It can also return value
- * < 0 in case of error during IO submission.
+ * extent to map because we cannot extend it anymore.
  */
 static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 				   struct buffer_head *head,
 				   struct buffer_head *bh,
 				   ext4_lblk_t lblk)
 {
 	struct inode *inode = mpd->inode;
-	int err;
 	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
 							>> inode->i_blkbits;
 
 	if (ext4_verity_in_progress(inode))
 		blocks = EXT_MAX_BLOCKS;
@@ -2199,13 +2194,11 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 			break;
 		}
 	} while (lblk++, (bh = bh->b_this_page) != head);
 	/* So far everything mapped? Submit the page for IO. */
 	if (mpd->map.m_len == 0) {
-		err = mpage_submit_folio(mpd, head->b_folio);
-		if (err < 0)
-			return err;
+		mpage_submit_folio(mpd, head->b_folio);
 		mpage_folio_done(mpd, head->b_folio);
 	}
 	if (lblk >= blocks) {
 		mpd->scanned_until_end = 1;
 		return 0;
@@ -2331,13 +2324,11 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 			 * So we return to call further extent mapping.
 			 */
 			if (err < 0 || map_bh)
 				goto out;
 			/* Page fully mapped - let IO run! */
-			err = mpage_submit_folio(mpd, folio);
-			if (err < 0)
-				goto out;
+			mpage_submit_folio(mpd, folio);
 			mpage_folio_done(mpd, folio);
 		}
 		folio_batch_release(&fbatch);
 	}
 	/* Extent fully mapped and matches with page boundary. We are done. */
@@ -2406,11 +2397,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
 {
 	struct inode *inode = mpd->inode;
 	struct folio *folio;
 	loff_t pos;
-	int ret;
 
 	folio = filemap_get_folio(inode->i_mapping,
 				  mpd->start_pos >> PAGE_SHIFT);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
@@ -2421,25 +2411,22 @@ static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
 	pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
 	if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
 			 !folio_contains(folio, pos >> PAGE_SHIFT)))
 		return -EINVAL;
 
-	ret = mpage_submit_folio(mpd, folio);
-	if (ret)
-		goto out;
+	mpage_submit_folio(mpd, folio);
 	/*
 	 * Update start_pos to prevent this folio from being released in
 	 * mpage_release_unused_pages(), it will be reset to the aligned folio
 	 * pos when this folio is written again in the next round. Additionally,
 	 * do not update wbc->nr_to_write here, as it will be updated once the
 	 * entire folio has finished processing.
 	 */
 	mpd->start_pos = pos;
-out:
 	folio_unlock(folio);
 	folio_put(folio);
-	return ret;
+	return 0;
 }
 
 /*
  * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
  *				 mpd->len and submit pages underlying it for IO
@@ -2722,13 +2709,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			 * location before possibly journalling it again which
 			 * is desirable when the page is frequently dirtied
 			 * through a pin.
 			 */
 			if (!mpd->can_map) {
-				err = mpage_submit_folio(mpd, folio);
-				if (err < 0)
-					goto out;
+				mpage_submit_folio(mpd, folio);
 				/* Pending dirtying of journalled data? */
 				if (folio_test_checked(folio)) {
 					err = mpage_journal_page_buffers(handle,
 						mpd, folio);
 					if (err < 0)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 557f44178d87..0236b6b9785a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -457,11 +457,11 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
 		goto submit_and_retry;
 	wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
 	io->io_next_block++;
 }
 
-int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
+void ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 		size_t len)
 {
 	struct inode *inode = folio->mapping->host;
 	unsigned block_start;
 	struct buffer_head *bh, *head;
@@ -531,11 +531,11 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 		 * We have nothing to submit. Just cycle the folio through
 		 * writeback state to properly update xarray tags.
 		 */
 		__folio_start_writeback(folio, keep_towrite);
 		folio_end_writeback(folio);
-		return 0;
+		return;
 	}
 
 	bh = head = folio_buffers(folio);
 
 	__folio_start_writeback(folio, keep_towrite);
@@ -544,8 +544,6 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 	do {
 		if (!buffer_async_write(bh))
 			continue;
 		io_submit_add_bh(io, inode, folio, bh);
 	} while ((bh = bh->b_this_page) != head);
-
-	return 0;
 }
-- 
2.54.0


^ permalink raw reply related

* [PATCH 08/16] ext4: Further de-generalize the bio postprocessing code
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

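Since the bio postprocessing code in fs/ext4/readpage.c is now used only
for fsverity, rename the context struct, its slab cache and mempool, and
the related functions accordingly, and fold bio_post_read_required()
into its only caller.  Also don't create the cache and mempool at all
when !CONFIG_FS_VERITY.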

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/ext4/ext4.h     |  4 +--
 fs/ext4/readpage.c | 66 ++++++++++++++++++++++------------------------
 fs/ext4/super.c    |  6 ++---
 3 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 920a8ec1b948..af6cf0bbc5e2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3827,12 +3827,12 @@ static inline void ext4_set_de_type(struct super_block *sb,
 }
 
 /* readpages.c */
 int ext4_read_folio(struct file *file, struct folio *folio);
 void ext4_readahead(struct readahead_control *rac);
-extern int __init ext4_init_post_read_processing(void);
-extern void ext4_exit_post_read_processing(void);
+extern int __init ext4_init_verity_caches(void);
+extern void ext4_exit_verity_caches(void);
 
 /* symlink.c */
 extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 8af183798a33..322226290e65 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -45,16 +45,16 @@
 #include <linux/backing-dev.h>
 
 #include "ext4.h"
 #include <trace/events/ext4.h>
 
-#define NUM_PREALLOC_POST_READ_CTXS	128
+#define NUM_VERITY_WORKS 128
 
-static struct kmem_cache *bio_post_read_ctx_cache;
-static mempool_t *bio_post_read_ctx_pool;
+static struct kmem_cache *ext4_verity_work_cache;
+static mempool_t *ext4_verity_work_pool;
 
-struct bio_post_read_ctx {
+struct ext4_verity_work {
 	struct bio *bio;
 	struct fsverity_info *vi;
 	struct work_struct work;
 };
 
@@ -63,39 +63,33 @@ static void __read_end_io(struct bio *bio)
 	struct folio_iter fi;
 
 	bio_for_each_folio_all(fi, bio)
 		folio_end_read(fi.folio, bio->bi_status == 0);
 	if (bio->bi_private)
-		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+		mempool_free(bio->bi_private, ext4_verity_work_pool);
 	bio_put(bio);
 }
 
 static void verity_work(struct work_struct *work)
 {
-	struct bio_post_read_ctx *ctx =
-		container_of(work, struct bio_post_read_ctx, work);
+	struct ext4_verity_work *ctx =
+		container_of(work, struct ext4_verity_work, work);
 	struct bio *bio = ctx->bio;
 	struct fsverity_info *vi = ctx->vi;
 
 	/*
-	 * Free the bio_post_read_ctx right away, since it's no longer needed.
+	 * Free the ext4_verity_work right away, since it's no longer needed.
 	 * This relieves the pressure on the mempool as much as possible.
 	 */
-	mempool_free(ctx, bio_post_read_ctx_pool);
+	mempool_free(ctx, ext4_verity_work_pool);
 	bio->bi_private = NULL;
 
 	fsverity_verify_bio(vi, bio);
 
 	__read_end_io(bio);
 }
 
-static bool bio_post_read_required(struct bio *bio)
-{
-	return IS_ENABLED(CONFIG_FS_VERITY) && bio->bi_private &&
-	       !bio->bi_status;
-}
-
 /*
  * I/O completion handler for multipage BIOs.
  *
  * The mpage code never puts partial pages into a BIO (except for end-of-file).
  * If a page does not map to a contiguous run of blocks then it simply falls
@@ -106,28 +100,28 @@ static bool bio_post_read_required(struct bio *bio)
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
 static void mpage_end_io(struct bio *bio)
 {
-	if (bio_post_read_required(bio)) {
-		struct bio_post_read_ctx *ctx = bio->bi_private;
+	if (IS_ENABLED(CONFIG_FS_VERITY) && bio->bi_private &&
+	    !bio->bi_status) {
+		struct ext4_verity_work *ctx = bio->bi_private;
 
 		INIT_WORK(&ctx->work, verity_work);
 		fsverity_enqueue_verify_work(&ctx->work);
 		return;
 	}
 	__read_end_io(bio);
 }
 
-static void ext4_set_bio_post_read_ctx(struct bio *bio,
-				       const struct inode *inode,
-				       struct fsverity_info *vi)
+static void ext4_set_verity_work(struct bio *bio, const struct inode *inode,
+				 struct fsverity_info *vi)
 {
 	if (vi) {
 		/* Due to the mempool, this never fails. */
-		struct bio_post_read_ctx *ctx =
-			mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+		struct ext4_verity_work *ctx =
+			mempool_alloc(ext4_verity_work_pool, GFP_NOFS);
 
 		ctx->bio = bio;
 		ctx->vi = vi;
 		bio->bi_private = ctx;
 	}
@@ -287,11 +281,11 @@ static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi,
 			 * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
 			 */
 			bio = bio_alloc(bdev, bio_max_segs(nr_pages),
 					REQ_OP_READ, GFP_KERNEL);
 			fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_KERNEL);
-			ext4_set_bio_post_read_ctx(bio, inode, vi);
+			ext4_set_verity_work(bio, inode, vi);
 			bio->bi_iter.bi_sector = first_block << (blkbits - 9);
 			bio->bi_end_io = mpage_end_io;
 			if (rac)
 				bio->bi_opf |= REQ_RAHEAD;
 		}
@@ -361,29 +355,33 @@ void ext4_readahead(struct readahead_control *rac)
 		fsverity_readahead(vi, readahead_index(rac),
 				   readahead_count(rac));
 	ext4_mpage_readpages(inode, vi, rac, NULL);
 }
 
-int __init ext4_init_post_read_processing(void)
+int __init ext4_init_verity_caches(void)
 {
-	bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
+	if (!IS_ENABLED(CONFIG_FS_VERITY))
+		return 0;
+	ext4_verity_work_cache =
+		KMEM_CACHE(ext4_verity_work, SLAB_RECLAIM_ACCOUNT);
 
-	if (!bio_post_read_ctx_cache)
+	if (!ext4_verity_work_cache)
 		goto fail;
-	bio_post_read_ctx_pool =
-		mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
-					 bio_post_read_ctx_cache);
-	if (!bio_post_read_ctx_pool)
+	ext4_verity_work_pool = mempool_create_slab_pool(
+		NUM_VERITY_WORKS, ext4_verity_work_cache);
+	if (!ext4_verity_work_pool)
 		goto fail_free_cache;
 	return 0;
 
 fail_free_cache:
-	kmem_cache_destroy(bio_post_read_ctx_cache);
+	kmem_cache_destroy(ext4_verity_work_cache);
 fail:
 	return -ENOMEM;
 }
 
-void ext4_exit_post_read_processing(void)
+void ext4_exit_verity_caches(void)
 {
-	mempool_destroy(bio_post_read_ctx_pool);
-	kmem_cache_destroy(bio_post_read_ctx_cache);
+	if (!IS_ENABLED(CONFIG_FS_VERITY))
+		return;
+	mempool_destroy(ext4_verity_work_pool);
+	kmem_cache_destroy(ext4_verity_work_cache);
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 245f67d10ded..cb9ca0dc4664 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -7529,11 +7529,11 @@ static int __init ext4_init_fs(void)
 
 	err = ext4_init_pending();
 	if (err)
 		goto out7;
 
-	err = ext4_init_post_read_processing();
+	err = ext4_init_verity_caches();
 	if (err)
 		goto out6;
 
 	err = ext4_init_pageio();
 	if (err)
@@ -7578,11 +7578,11 @@ static int __init ext4_init_fs(void)
 out3:
 	ext4_exit_system_zone();
 out4:
 	ext4_exit_pageio();
 out5:
-	ext4_exit_post_read_processing();
+	ext4_exit_verity_caches();
 out6:
 	ext4_exit_pending();
 out7:
 	ext4_exit_es();
 
@@ -7599,11 +7599,11 @@ static void __exit ext4_exit_fs(void)
 	destroy_inodecache();
 	ext4_exit_mballoc();
 	ext4_exit_sysfs();
 	ext4_exit_system_zone();
 	ext4_exit_pageio();
-	ext4_exit_post_read_processing();
+	ext4_exit_verity_caches();
 	ext4_exit_es();
 	ext4_exit_pending();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
2.54.0


^ permalink raw reply related

* [PATCH 09/16] f2fs: Remove fs-layer file contents en/decryption code
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Now that fscrypt's file contents en/decryption is always implemented
using blk-crypto when the filesystem is block-based, the fs-layer
en/decryption code in f2fs is now unused.  Remove it.

Note that the struct f2fs_io_info field encrypted_page is kept because
it is still used by the garbage collection path to relocate encrypted
blocks using raw meta pages from META_MAPPING.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/f2fs/compress.c | 28 ++------------
 fs/f2fs/data.c     | 93 +++++-----------------------------------------
 fs/f2fs/f2fs.h     |  2 -
 fs/f2fs/segment.c  |  2 -
 fs/f2fs/super.c    |  1 -
 5 files changed, 12 insertions(+), 114 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 881e76158b96..e0ad9ba315b4 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1282,12 +1282,10 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 		.page = NULL,
 		.encrypted_page = NULL,
 		.compressed_page = NULL,
 		.io_type = io_type,
 		.io_wbc = wbc,
-		.encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ?
-									1 : 0,
 	};
 	struct folio *folio;
 	struct dnode_of_data dn;
 	struct node_info ni;
 	struct compress_io_ctx *cic;
@@ -1357,18 +1355,10 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 		fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_folio,
 						dn.ofs_in_node + i + 1);
 
 		/* wait for GCed page writeback via META_MAPPING */
 		f2fs_wait_on_block_writeback(inode, fio.old_blkaddr);
-
-		if (fio.encrypted) {
-			fio.page = cc->rpages[i + 1];
-			err = f2fs_encrypt_one_page(&fio);
-			if (err)
-				goto out_destroy_crypt;
-			cc->cpages[i] = fio.encrypted_page;
-		}
 	}
 
 	set_cluster_writeback(cc);
 
 	for (i = 0; i < cc->cluster_size; i++)
@@ -1402,25 +1392,19 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 			goto unlock_continue;
 		}
 
 		f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR);
 
-		if (fio.encrypted)
-			fio.encrypted_page = cc->cpages[i - 1];
-		else
-			fio.compressed_page = cc->cpages[i - 1];
+		fio.compressed_page = cc->cpages[i - 1];
 
 		cc->cpages[i - 1] = NULL;
 		fio.submitted = 0;
 		f2fs_outplace_write_data(&dn, &fio);
 		if (unlikely(!fio.submitted)) {
 			cancel_cluster_writeback(cc, cic, i);
-
-			/* To call fscrypt_finalize_bounce_page */
-			i = cc->valid_nr_cpages;
 			*submitted = 0;
-			goto out_destroy_crypt;
+			goto out_free_page_array;
 		}
 		(*submitted)++;
 unlock_continue:
 		inode_dec_dirty_pages(cc->inode);
 		folio_unlock(fio.folio);
@@ -1448,18 +1432,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 	page_array_free(sbi, cc->cpages, cc->nr_cpages);
 	cc->cpages = NULL;
 	f2fs_destroy_compress_ctx(cc, false);
 	return 0;
 
-out_destroy_crypt:
+out_free_page_array:
 	page_array_free(sbi, cic->rpages, cc->cluster_size);
-
-	for (--i; i >= 0; i--) {
-		if (!cc->cpages[i])
-			continue;
-		fscrypt_finalize_bounce_page(&cc->cpages[i]);
-	}
 out_put_cic:
 	kmem_cache_free(cic_entry_slab, cic);
 out_put_dnode:
 	f2fs_put_dnode(&dn);
 out_unlock_op:
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8d4f1e75dee3..315bfe40da87 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -57,13 +57,10 @@ bool f2fs_is_cp_guaranteed(const struct folio *folio)
 {
 	struct address_space *mapping = folio->mapping;
 	struct inode *inode;
 	struct f2fs_sb_info *sbi;
 
-	if (fscrypt_is_bounce_folio(folio))
-		return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio));
-
 	inode = mapping->host;
 	sbi = F2FS_I_SB(inode);
 
 	if (inode->i_ino == F2FS_META_INO(sbi) ||
 			inode->i_ino == F2FS_NODE_INO(sbi) ||
@@ -93,15 +90,10 @@ static enum count_type __read_io_type(struct folio *folio)
 	return F2FS_RD_DATA;
 }
 
 /* postprocessing steps for read bios */
 enum bio_post_read_step {
-#ifdef CONFIG_FS_ENCRYPTION
-	STEP_DECRYPT	= BIT(0),
-#else
-	STEP_DECRYPT	= 0,	/* compile out the decryption-related code */
-#endif
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 	STEP_DECOMPRESS	= BIT(1),
 #else
 	STEP_DECOMPRESS	= 0,	/* compile out the decompression-related code */
 #endif
@@ -293,15 +285,10 @@ static void f2fs_post_read_work(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
 	struct bio *bio = ctx->bio;
 
-	if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) {
-		f2fs_finish_read_bio(bio, true);
-		return;
-	}
-
 	if (ctx->enabled_steps & STEP_DECOMPRESS)
 		f2fs_handle_step_decompress(ctx, true);
 
 	f2fs_verify_and_finish_bio(bio, true);
 }
@@ -321,22 +308,15 @@ static void f2fs_read_end_io(struct bio *bio)
 	if (bio->bi_status != BLK_STS_OK) {
 		f2fs_finish_read_bio(bio, intask);
 		return;
 	}
 
-	if (ctx) {
-		unsigned int enabled_steps = ctx->enabled_steps &
-					(STEP_DECRYPT | STEP_DECOMPRESS);
-
-		/*
-		 * If we have only decompression step between decompression and
-		 * decrypt, we don't need post processing for this.
-		 */
-		if (enabled_steps == STEP_DECOMPRESS &&
-				!f2fs_low_mem_mode(sbi)) {
+	if (ctx && (ctx->enabled_steps & STEP_DECOMPRESS)) {
+		if (!f2fs_low_mem_mode(sbi)) {
+			/* Decompress inline. */
 			f2fs_handle_step_decompress(ctx, intask);
-		} else if (enabled_steps) {
+		} else {
 			INIT_WORK(&ctx->work, f2fs_post_read_work);
 			queue_work(ctx->sbi->post_read_wq, &ctx->work);
 			return;
 		}
 	}
@@ -357,17 +337,10 @@ static void f2fs_write_end_io(struct bio *bio)
 
 	bio_for_each_folio_all(fi, bio) {
 		struct folio *folio = fi.folio;
 		enum count_type type;
 
-		if (fscrypt_is_bounce_folio(folio)) {
-			struct folio *io_folio = folio;
-
-			folio = fscrypt_pagecache_folio(io_folio);
-			fscrypt_free_bounce_page(&io_folio->page);
-		}
-
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 		if (f2fs_is_compressed_page(folio)) {
 			f2fs_compress_write_end_io(bio, folio);
 			continue;
 		}
@@ -599,15 +572,10 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
 		return true;
 
 	bio_for_each_folio_all(fi, bio) {
 		struct folio *target = fi.folio;
 
-		if (fscrypt_is_bounce_folio(target)) {
-			target = fscrypt_pagecache_folio(target);
-			if (IS_ERR(target))
-				continue;
-		}
 		if (f2fs_is_compressed_page(target)) {
 			target = f2fs_compress_control_folio(target);
 			if (IS_ERR(target))
 				continue;
 		}
@@ -1117,13 +1085,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode,
 			       for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
 	bio->bi_iter.bi_sector = sector;
 	f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
 	bio->bi_end_io = f2fs_read_end_io;
 
-	if (fscrypt_inode_uses_fs_layer_crypto(inode))
-		post_read_steps |= STEP_DECRYPT;
-
 	if (vi)
 		post_read_steps |= STEP_VERITY;
 
 	/*
 	 * STEP_DECOMPRESS is handled specially, since a compressed file might
@@ -2808,39 +2773,10 @@ static void f2fs_readahead(struct readahead_control *rac)
 		fsverity_readahead(vi, readahead_index(rac),
 				   readahead_count(rac));
 	f2fs_mpage_readpages(inode, vi, rac, NULL);
 }
 
-int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
-{
-	struct inode *inode = fio_inode(fio);
-	struct folio *mfolio;
-	struct page *page;
-
-	if (!f2fs_encrypted_file(inode))
-		return 0;
-
-	page = fio->compressed_page ? fio->compressed_page : fio->page;
-
-	if (fscrypt_inode_uses_inline_crypto(inode))
-		return 0;
-
-	fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page),
-					PAGE_SIZE, 0, GFP_NOFS);
-	if (IS_ERR(fio->encrypted_page))
-		return PTR_ERR(fio->encrypted_page);
-
-	mfolio = filemap_lock_folio(META_MAPPING(fio->sbi), fio->old_blkaddr);
-	if (!IS_ERR(mfolio)) {
-		if (folio_test_uptodate(mfolio))
-			memcpy(folio_address(mfolio),
-				page_address(fio->encrypted_page), PAGE_SIZE);
-		f2fs_folio_put(mfolio, true);
-	}
-	return 0;
-}
-
 static inline bool check_inplace_update_policy(struct inode *inode,
 				struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
@@ -3009,26 +2945,19 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 	 * it had better in-place writes for updated data.
 	 */
 	if (ipu_force ||
 		(__is_valid_data_blkaddr(fio->old_blkaddr) &&
 					need_inplace_update(fio))) {
-		err = f2fs_encrypt_one_page(fio);
-		if (err)
-			goto out_writepage;
-
 		folio_start_writeback(folio);
 		f2fs_put_dnode(&dn);
 		if (fio->need_lock == LOCK_REQ)
 			f2fs_unlock_op(fio->sbi, &lc);
 		err = f2fs_inplace_write_data(fio);
-		if (err) {
-			if (fscrypt_inode_uses_fs_layer_crypto(inode))
-				fscrypt_finalize_bounce_page(&fio->encrypted_page);
+		if (err)
 			folio_end_writeback(folio);
-		} else {
+		else
 			set_inode_flag(inode, FI_UPDATE_WRITE);
-		}
 		trace_f2fs_do_write_data_page(folio, IPU);
 		return err;
 	}
 
 	if (fio->need_lock == LOCK_RETRY) {
@@ -3043,14 +2972,10 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 	if (err)
 		goto out_writepage;
 
 	fio->version = ni.version;
 
-	err = f2fs_encrypt_one_page(fio);
-	if (err)
-		goto out_writepage;
-
 	folio_start_writeback(folio);
 
 	if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR)
 		f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false);
 
@@ -4547,13 +4472,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return err;
 
 	iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk);
 
 	/*
-	 * When inline encryption is enabled, sometimes I/O to an encrypted file
-	 * has to be broken up to guarantee DUN contiguity.  Handle this by
-	 * limiting the length of the mapping returned.
+	 * Sometimes I/O to an encrypted file has to be broken up to guarantee
+	 * DUN contiguity.  Handle this by limiting the length of the mapping
+	 * returned.
 	 */
 	map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
 
 	/*
 	 * We should never see delalloc or compressed extents here based on
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 91f506e7c9cf..746e678ceb1a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1353,11 +1353,10 @@ struct f2fs_io_info {
 	unsigned int need_lock:8;	/* indicate we need to lock cp_rwsem */
 	unsigned int version:8;		/* version of the node */
 	unsigned int submitted:1;	/* indicate IO submission */
 	unsigned int in_list:1;		/* indicate fio is in io_list */
 	unsigned int is_por:1;		/* indicate IO is from recovery or not */
-	unsigned int encrypted:1;	/* indicate file is encrypted */
 	unsigned int meta_gc:1;		/* require meta inode GC */
 	enum iostat_type io_type;	/* io type */
 	struct writeback_control *io_wbc; /* writeback control */
 	struct bio **bio;		/* bio for ipu */
 	sector_t *last_block;		/* last block number in bio */
@@ -4176,11 +4175,10 @@ struct folio *f2fs_get_new_data_folio(struct inode *inode,
 			struct folio *ifolio, pgoff_t index, bool new_i_size);
 int f2fs_do_write_data_page(struct f2fs_io_info *fio);
 int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			u64 start, u64 len);
-int f2fs_encrypt_one_page(struct f2fs_io_info *fio);
 bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio);
 bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
 int f2fs_write_single_data_page(struct folio *folio, int *submitted,
 				struct bio **bio, sector_t *last_block,
 				struct writeback_control *wbc,
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 788f8b050249..e45eb0ff961d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3985,12 +3985,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 	if (unlikely(err)) {
 		f2fs_err_ratelimited(fio->sbi,
 			"%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d",
 			__func__, fio->ino, folio->index, type,
 			fio->old_blkaddr, fio->new_blkaddr, err);
-		if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
-			fscrypt_finalize_bounce_page(&fio->encrypted_page);
 		folio_end_writeback(folio);
 		if (f2fs_in_warm_node_list(folio))
 			f2fs_del_fsync_node_entry(fio->sbi, folio);
 		f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi,
 							CP_ERROR_FLAG));
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f3f6768f8cca..fd9d3ea4c058 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3753,11 +3753,10 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 
 static const struct fscrypt_operations f2fs_cryptops = {
 	.inode_info_offs	= (int)offsetof(struct f2fs_inode_info, i_crypt_info) -
 				  (int)offsetof(struct f2fs_inode_info, vfs_inode),
 	.is_block_based		= 1,
-	.needs_bounce_pages	= 1,
 	.has_32bit_inodes	= 1,
 	.supports_subblock_data_units = 1,
 	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
-- 
2.54.0


^ permalink raw reply related

* [PATCH 10/16] fs/buffer: Remove fs-layer decryption code
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Now that fscrypt's file contents en/decryption is always implemented
using blk-crypto when the filesystem is block-based, the fs-layer
decryption code in fs/buffer.c is now unused.  Remove it.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/buffer.c | 45 ++++++++-------------------------------------
 1 file changed, 8 insertions(+), 37 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 9af5f061a1f8..21dd9596a941 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -334,82 +334,53 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 
 still_busy:
 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 }
 
-struct postprocess_bh_ctx {
+struct verify_bh_ctx {
 	struct work_struct work;
 	struct buffer_head *bh;
 	struct fsverity_info *vi;
 };
 
 static void verify_bh(struct work_struct *work)
 {
-	struct postprocess_bh_ctx *ctx =
-		container_of(work, struct postprocess_bh_ctx, work);
+	struct verify_bh_ctx *ctx =
+		container_of(work, struct verify_bh_ctx, work);
 	struct buffer_head *bh = ctx->bh;
 	bool valid;
 
 	valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
 				       bh_offset(bh));
 	end_buffer_async_read(bh, valid);
 	kfree(ctx);
 }
 
-static void decrypt_bh(struct work_struct *work)
-{
-	struct postprocess_bh_ctx *ctx =
-		container_of(work, struct postprocess_bh_ctx, work);
-	struct buffer_head *bh = ctx->bh;
-	int err;
-
-	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
-					       bh_offset(bh));
-	if (err == 0 && ctx->vi) {
-		/*
-		 * We use different work queues for decryption and for verity
-		 * because verity may require reading metadata pages that need
-		 * decryption, and we shouldn't recurse to the same workqueue.
-		 */
-		INIT_WORK(&ctx->work, verify_bh);
-		fsverity_enqueue_verify_work(&ctx->work);
-		return;
-	}
-	end_buffer_async_read(bh, err == 0);
-	kfree(ctx);
-}
-
 /*
  * I/O completion handler for block_read_full_folio() - folios
  * which come unlocked at the end of I/O.
  */
 static void bh_end_async_read(struct bio *bio)
 {
 	struct buffer_head *bh;
 	bool uptodate = bio_endio_bh(bio, &bh);
 	struct inode *inode = bh->b_folio->mapping->host;
-	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
 	struct fsverity_info *vi = NULL;
 
 	/* needed by ext4 */
 	if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
 		vi = fsverity_get_info(inode);
 
-	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
-	if (uptodate && (decrypt || vi)) {
-		struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
+	/* Verify (with fsverity) if needed. */
+	if (vi && uptodate) {
+		struct verify_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
 
 		if (ctx) {
 			ctx->bh = bh;
 			ctx->vi = vi;
-			if (decrypt) {
-				INIT_WORK(&ctx->work, decrypt_bh);
-				fscrypt_enqueue_decrypt_work(&ctx->work);
-			} else {
-				INIT_WORK(&ctx->work, verify_bh);
-				fsverity_enqueue_verify_work(&ctx->work);
-			}
+			INIT_WORK(&ctx->work, verify_bh);
+			fsverity_enqueue_verify_work(&ctx->work);
 			return;
 		}
 		uptodate = false;
 	}
 	end_buffer_async_read(bh, uptodate);
-- 
2.54.0


^ permalink raw reply related

* [PATCH 11/16] fscrypt: Replace calls to fscrypt_inode_uses_inline_crypto()
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Now that fscrypt's file contents en/decryption is always implemented
using blk-crypto when the filesystem is block-based, the calls to
fscrypt_inode_uses_inline_crypto() in fs/crypto/inline_crypt.c (which
contains functions that are called only from block-based filesystems)
are equivalent to checking whether the file is an encrypted regular
file, i.e. fscrypt_needs_contents_encryption().  Use that instead.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/inline_crypt.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index caf706215621..111ea45732f0 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -225,12 +225,12 @@ static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
  * @inode: the file's inode
  * @pos: the first file position (in bytes) in the I/O
  * @gfp_mask: memory allocation flags - these must be a waiting mask so that
  *					bio_crypt_set_ctx can't fail.
  *
- * If the contents of the file should be encrypted (or decrypted) with inline
- * encryption, then assign the appropriate encryption context to the bio.
+ * If the contents of the file should be encrypted (or decrypted), then assign
+ * the appropriate encryption context to the bio.
  *
  * Normally the bio should be newly allocated (i.e. no pages added yet), as
  * otherwise fscrypt_mergeable_bio() won't work as intended.
  *
  * The encryption context will be freed automatically when the bio is freed.
@@ -239,11 +239,11 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 			       loff_t pos, gfp_t gfp_mask)
 {
 	const struct fscrypt_inode_info *ci;
 	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
 
-	if (!fscrypt_inode_uses_inline_crypto(inode))
+	if (!fscrypt_needs_contents_encryption(inode))
 		return;
 	ci = fscrypt_get_inode_info_raw(inode);
 
 	fscrypt_generate_dun(ci, pos, dun);
 	bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
@@ -254,16 +254,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
  * fscrypt_mergeable_bio() - test whether data can be added to a bio
  * @bio: the bio being built up
  * @inode: the inode for the next part of the I/O
  * @pos: the next file position (in bytes) in the I/O
  *
- * When building a bio which may contain data which should undergo inline
- * encryption (or decryption) via fscrypt, filesystems should call this function
- * to ensure that the resulting bio contains only contiguous data unit numbers.
- * This will return false if the next part of the I/O cannot be merged with the
- * bio because either the encryption key would be different or the encryption
- * data unit numbers would be discontiguous.
+ * When building a bio which may contain data which should undergo encryption
+ * (or decryption) via fscrypt, filesystems should call this function to ensure
+ * that the resulting bio contains only contiguous data unit numbers.  This will
+ * return false if the next part of the I/O cannot be merged with the bio
+ * because either the encryption key would be different or the encryption data
+ * unit numbers would be discontiguous.
  *
  * fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
  *
  * This function isn't required in cases where crypto-mergeability is ensured in
  * another way, such as I/O targeting only a single file (and thus a single key)
@@ -276,11 +276,11 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 {
 	const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
 	const struct fscrypt_inode_info *ci;
 	u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
 
-	if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
+	if (!!bc != fscrypt_needs_contents_encryption(inode))
 		return false;
 	if (!bc)
 		return true;
 	ci = fscrypt_get_inode_info_raw(inode);
 
@@ -334,11 +334,11 @@ bool fscrypt_dio_supported(struct inode *inode)
 		 * Key unavailable or couldn't be set up.  This edge case isn't
 		 * worth worrying about; just report that DIO is unsupported.
 		 */
 		return false;
 	}
-	return fscrypt_inode_uses_inline_crypto(inode);
+	return true;
 }
 EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
 
 /**
  * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
@@ -363,11 +363,11 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
 {
 	const struct fscrypt_inode_info *ci;
 	u32 dun;
 
-	if (!fscrypt_inode_uses_inline_crypto(inode))
+	if (!fscrypt_needs_contents_encryption(inode))
 		return nr_blocks;
 
 	if (nr_blocks <= 1)
 		return nr_blocks;
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH 12/16] fscrypt: Remove fscrypt_dio_supported()
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

On block-based filesystems, fscrypt file contents encryption is now
always implemented using blk-crypto.  This implementation supports
direct I/O.

Therefore, fscrypt_dio_supported() now always returns true, except in
the edge case where statx(STATX_DIOALIGN) is called on an encrypted
regular file that hasn't had its key set up.  But that was really a
workaround rather than the desired behavior, so we can disregard it.

Thus, fscrypt_dio_supported() is no longer needed.  Remove it.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/inline_crypt.c | 43 ----------------------------------------
 fs/ext4/inode.c          |  5 +----
 fs/f2fs/file.c           |  2 --
 include/linux/fscrypt.h  |  7 -------
 4 files changed, 1 insertion(+), 56 deletions(-)

diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 111ea45732f0..3c3a46c5af42 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -295,53 +295,10 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 	fscrypt_generate_dun(ci, pos, next_dun);
 	return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
 }
 EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
 
-/**
- * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
- *			     inode, as far as encryption is concerned
- * @inode: the inode in question
- *
- * Return: %true if there are no encryption constraints that prevent DIO from
- *	   being supported; %false if DIO is unsupported.  (Note that in the
- *	   %true case, the filesystem might have other, non-encryption-related
- *	   constraints that prevent DIO from actually being supported.  Also, on
- *	   encrypted files the filesystem is still responsible for only allowing
- *	   DIO when requests are filesystem-block-aligned.)
- */
-bool fscrypt_dio_supported(struct inode *inode)
-{
-	int err;
-
-	/* If the file is unencrypted, no veto from us. */
-	if (!fscrypt_needs_contents_encryption(inode))
-		return true;
-
-	/*
-	 * We only support DIO with inline crypto, not fs-layer crypto.
-	 *
-	 * To determine whether the inode is using inline crypto, we have to set
-	 * up the key if it wasn't already done.  This is because in the current
-	 * design of fscrypt, the decision of whether to use inline crypto or
-	 * not isn't made until the inode's encryption key is being set up.  In
-	 * the DIO read/write case, the key will always be set up already, since
-	 * the file will be open.  But in the case of statx(), the key might not
-	 * be set up yet, as the file might not have been opened yet.
-	 */
-	err = fscrypt_require_key(inode);
-	if (err) {
-		/*
-		 * Key unavailable or couldn't be set up.  This edge case isn't
-		 * worth worrying about; just report that DIO is unsupported.
-		 */
-		return false;
-	}
-	return true;
-}
-EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
-
 /**
  * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
  * @inode: the file on which I/O is being done
  * @lblk: the block at which the I/O is being started from
  * @nr_blocks: the number of blocks we want to submit starting at @lblk
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6faa7c751ca..dd321aaa8779 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6144,15 +6144,12 @@ u32 ext4_dio_alignment(struct inode *inode)
 		return 0;
 	if (ext4_should_journal_data(inode))
 		return 0;
 	if (ext4_has_inline_data(inode))
 		return 0;
-	if (IS_ENCRYPTED(inode)) {
-		if (!fscrypt_dio_supported(inode))
-			return 0;
+	if (IS_ENCRYPTED(inode))
 		return i_blocksize(inode);
-	}
 	return 1; /* use the iomap defaults */
 }
 
 int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
 		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index fb12c5c9affd..a726bc2ab66c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -948,12 +948,10 @@ int f2fs_truncate(struct inode *inode)
 
 static bool f2fs_force_buffered_io(struct inode *inode, int rw)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-	if (!fscrypt_dio_supported(inode))
-		return true;
 	if (fsverity_active(inode))
 		return true;
 	if (f2fs_compressed_file(inode))
 		return true;
 	/*
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 8d19b95150f1..43bafdd67dd7 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -868,12 +868,10 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 			       loff_t pos, gfp_t gfp_mask);
 
 bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 			   loff_t pos);
 
-bool fscrypt_dio_supported(struct inode *inode);
-
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
 
 #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
 static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
@@ -885,15 +883,10 @@ static inline bool fscrypt_mergeable_bio(struct bio *bio,
 					 loff_t pos)
 {
 	return true;
 }
 
-static inline bool fscrypt_dio_supported(struct inode *inode)
-{
-	return !fscrypt_needs_contents_encryption(inode);
-}
-
 static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk,
 					  u64 nr_blocks)
 {
 	return nr_blocks;
 }
-- 
2.54.0


^ permalink raw reply related

* [PATCH 13/16] fscrypt: Remove fs-layer zeroout code
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Now that fscrypt's file contents en/decryption is always implemented
using blk-crypto when the filesystem is block-based, the fs-layer
zeroout code in fs/crypto/bio.c is unused code.  Remove it, then fold
fscrypt_zeroout_range_inline_crypt() into fscrypt_zeroout_range().

Then make fscrypt_alloc_bounce_page() and fscrypt_crypt_data_unit()
static, since they're no longer called from any other file.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/bio.c             | 134 +++++++-----------------------------
 fs/crypto/crypto.c          |  14 ++--
 fs/crypto/fscrypt_private.h |   5 --
 3 files changed, 32 insertions(+), 121 deletions(-)

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index d07740680602..58b6b13eeedd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -67,20 +67,40 @@ static void fscrypt_zeroout_range_end_io(struct bio *bio)
 		cmpxchg(&done->status, 0, bio->bi_status);
 	fscrypt_zeroout_range_done(done);
 	bio_put(bio);
 }
 
-static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
-					      loff_t pos, sector_t sector,
-					      u64 len)
+/**
+ * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
+ * @inode: the file's inode
+ * @pos: the first file position (in bytes) to zero out
+ * @sector: the first sector to zero out
+ * @len: bytes to zero out
+ *
+ * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write
+ * ciphertext blocks which decrypt to the all-zeroes block.  The blocks must be
+ * both logically and physically contiguous.  It's also assumed that the
+ * filesystem only uses a single block device, ->s_bdev.  @len must be a
+ * multiple of the file system logical block size.
+ *
+ * Note that since each block uses a different IV, this involves writing a
+ * different ciphertext to each block; we can't simply reuse the same one.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
+int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
+			  sector_t sector, u64 len)
 {
 	struct fscrypt_zero_done done = {
 		.pending	= ATOMIC_INIT(1),
 		.done		= COMPLETION_INITIALIZER_ONSTACK(done.done),
 	};
 
-	while (len) {
+	if (len == 0)
+		return 0;
+
+	do {
 		struct bio *bio;
 		unsigned int n;
 
 		bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
 				GFP_NOFS);
@@ -100,117 +120,13 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
 				break;
 		}
 
 		atomic_inc(&done.pending);
 		blk_crypto_submit_bio(bio);
-	}
+	} while (len);
 
 	fscrypt_zeroout_range_done(&done);
 
 	wait_for_completion(&done.done);
 	return blk_status_to_errno(done.status);
 }
-
-/**
- * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
- * @inode: the file's inode
- * @pos: the first file position (in bytes) to zero out
- * @sector: the first sector to zero out
- * @len: bytes to zero out
- *
- * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write
- * ciphertext blocks which decrypt to the all-zeroes block.  The blocks must be
- * both logically and physically contiguous.  It's also assumed that the
- * filesystem only uses a single block device, ->s_bdev.  @len must be a
- * multiple of the file system logical block size.
- *
- * Note that since each block uses a different IV, this involves writing a
- * different ciphertext to each block; we can't simply reuse the same one.
- *
- * Return: 0 on success; -errno on failure.
- */
-int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
-			  sector_t sector, u64 len)
-{
-	const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
-	const unsigned int du_bits = ci->ci_data_unit_bits;
-	const unsigned int du_size = 1U << du_bits;
-	const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
-	const unsigned int du_per_page = 1U << du_per_page_bits;
-	u64 du_index = pos >> du_bits;
-	u64 du_remaining = len >> du_bits;
-	struct page *pages[16]; /* write up to 16 pages at a time */
-	unsigned int nr_pages;
-	unsigned int i;
-	unsigned int offset;
-	struct bio *bio;
-	int ret, err;
-
-	if (len == 0)
-		return 0;
-
-	if (fscrypt_inode_uses_inline_crypto(inode))
-		return fscrypt_zeroout_range_inline_crypt(inode, pos, sector,
-							  len);
-
-	BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
-	nr_pages = min_t(u64, ARRAY_SIZE(pages),
-			 (du_remaining + du_per_page - 1) >> du_per_page_bits);
-
-	/*
-	 * We need at least one page for ciphertext.  Allocate the first one
-	 * from a mempool, with __GFP_DIRECT_RECLAIM set so that it can't fail.
-	 *
-	 * Any additional page allocations are allowed to fail, as they only
-	 * help performance, and waiting on the mempool for them could deadlock.
-	 */
-	for (i = 0; i < nr_pages; i++) {
-		pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
-						     GFP_NOWAIT);
-		if (!pages[i])
-			break;
-	}
-	nr_pages = i;
-	if (WARN_ON_ONCE(nr_pages <= 0))
-		return -EINVAL;
-
-	/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
-	bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
-
-	do {
-		bio->bi_iter.bi_sector = sector;
-
-		i = 0;
-		offset = 0;
-		do {
-			err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
-						      ZERO_PAGE(0), pages[i],
-						      du_size, offset);
-			if (err)
-				goto out;
-			du_index++;
-			sector += 1U << (du_bits - SECTOR_SHIFT);
-			du_remaining--;
-			offset += du_size;
-			if (offset == PAGE_SIZE || du_remaining == 0) {
-				ret = bio_add_page(bio, pages[i++], offset, 0);
-				if (WARN_ON_ONCE(ret != offset)) {
-					err = -EIO;
-					goto out;
-				}
-				offset = 0;
-			}
-		} while (i != nr_pages && du_remaining != 0);
-
-		err = submit_bio_wait(bio);
-		if (err)
-			goto out;
-		bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
-	} while (du_remaining != 0);
-	err = 0;
-out:
-	bio_put(bio);
-	for (i = 0; i < nr_pages; i++)
-		fscrypt_free_bounce_page(pages[i]);
-	return err;
-}
 EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 94dd6c89ddcd..8c4660429418 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -47,11 +47,11 @@ void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
 	queue_work(fscrypt_read_workqueue, work);
 }
 EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
 
-struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
+static struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 {
 	if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
 		/*
 		 * Oops, the filesystem called a function that uses the bounce
 		 * page pool, but it didn't set needs_bounce_pages.
@@ -63,12 +63,11 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 
 /**
  * fscrypt_free_bounce_page() - free a ciphertext bounce page
  * @bounce_page: the bounce page to free, or NULL
  *
- * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(),
- * or by fscrypt_alloc_bounce_page() directly.
+ * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks().
  */
 void fscrypt_free_bounce_page(struct page *bounce_page)
 {
 	if (!bounce_page)
 		return;
@@ -105,14 +104,15 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
 	}
 	iv->index = cpu_to_le64(index);
 }
 
 /* Encrypt or decrypt a single "data unit" of file contents. */
-int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
-			    fscrypt_direction_t rw, u64 index,
-			    struct page *src_page, struct page *dest_page,
-			    unsigned int len, unsigned int offs)
+static int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+				   fscrypt_direction_t rw, u64 index,
+				   struct page *src_page,
+				   struct page *dest_page, unsigned int len,
+				   unsigned int offs)
 {
 	struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
 	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
 	union fscrypt_iv iv;
 	struct scatterlist dst, src;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 57b7ae2cfafc..da9040407d4a 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -327,15 +327,10 @@ typedef enum {
 } fscrypt_direction_t;
 
 /* crypto.c */
 extern struct kmem_cache *fscrypt_inode_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
-			    fscrypt_direction_t rw, u64 index,
-			    struct page *src_page, struct page *dest_page,
-			    unsigned int len, unsigned int offs);
-struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
 
 void __printf(3, 4) __cold
 fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
 
 #define fscrypt_warn(inode, fmt, ...)		\
-- 
2.54.0


^ permalink raw reply related

* [PATCH 14/16] fscrypt: Remove unused functions and workqueue
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Remove functions that are no longer used:

- fscrypt_decrypt_bio()
- fscrypt_decrypt_pagecache_blocks()
- fscrypt_inode_uses_fs_layer_crypto()
- fscrypt_inode_uses_inline_crypto()
- fscrypt_enqueue_decrypt_work()

This makes the decryption workqueue unused, so remove it too.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/bio.c         | 32 --------------------
 fs/crypto/crypto.c      | 65 -----------------------------------------
 include/linux/fscrypt.h | 47 -----------------------------
 3 files changed, 144 deletions(-)

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 58b6b13eeedd..db095258cfca 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -13,42 +13,10 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 
 #include "fscrypt_private.h"
 
-/**
- * fscrypt_decrypt_bio() - decrypt the contents of a bio
- * @bio: the bio to decrypt
- *
- * Decrypt the contents of a "read" bio following successful completion of the
- * underlying disk read.  The bio must be reading a whole number of blocks of an
- * encrypted file directly into the page cache.  If the bio is reading the
- * ciphertext into bounce pages instead of the page cache (for example, because
- * the file is also compressed, so decompression is required after decryption),
- * then this function isn't applicable.  This function may sleep, so it must be
- * called from a workqueue rather than from the bio's bi_end_io callback.
- *
- * Return: %true on success; %false on failure.  On failure, bio->bi_status is
- *	   also set to an error status.
- */
-bool fscrypt_decrypt_bio(struct bio *bio)
-{
-	struct folio_iter fi;
-
-	bio_for_each_folio_all(fi, bio) {
-		int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length,
-							   fi.offset);
-
-		if (err) {
-			bio->bi_status = errno_to_blk_status(err);
-			return false;
-		}
-	}
-	return true;
-}
-EXPORT_SYMBOL(fscrypt_decrypt_bio);
-
 struct fscrypt_zero_done {
 	atomic_t		pending;
 	blk_status_t		status;
 	struct completion	done;
 };
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 8c4660429418..27663f4d8705 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -36,21 +36,14 @@ module_param(num_prealloc_crypto_pages, uint, 0444);
 MODULE_PARM_DESC(num_prealloc_crypto_pages,
 		"Number of crypto pages to preallocate");
 
 static mempool_t *fscrypt_bounce_page_pool = NULL;
 
-static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
 struct kmem_cache *fscrypt_inode_info_cachep;
 
-void fscrypt_enqueue_decrypt_work(struct work_struct *work)
-{
-	queue_work(fscrypt_read_workqueue, work);
-}
-EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
-
 static struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 {
 	if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
 		/*
 		 * Oops, the filesystem called a function that uses the bounce
@@ -236,54 +229,10 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				       FS_ENCRYPT, lblk_num, page, page, len,
 				       offs);
 }
 EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
-/**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
- * @folio: the pagecache folio containing the data to decrypt
- * @len: size of the data to decrypt, in bytes
- * @offs: offset within @folio of the data to decrypt, in bytes
- *
- * Decrypt data that has just been read from an encrypted file.  The data must
- * be located in a pagecache folio that is still locked and not yet uptodate.
- * The length and offset of the data must be aligned to the file's crypto data
- * unit size.  Alignment to the filesystem block size fulfills this requirement,
- * as the filesystem block size is always a multiple of the data unit size.
- *
- * Return: 0 on success; -errno on failure
- */
-int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
-				     size_t offs)
-{
-	const struct inode *inode = folio->mapping->host;
-	const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
-	const unsigned int du_bits = ci->ci_data_unit_bits;
-	const unsigned int du_size = 1U << du_bits;
-	u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
-		    (offs >> du_bits);
-	size_t i;
-	int err;
-
-	if (WARN_ON_ONCE(!folio_test_locked(folio)))
-		return -EINVAL;
-
-	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
-		return -EINVAL;
-
-	for (i = offs; i < offs + len; i += du_size, index++) {
-		struct page *page = folio_page(folio, i >> PAGE_SHIFT);
-
-		err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
-					      page, du_size, i & ~PAGE_MASK);
-		if (err)
-			return err;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
-
 /**
  * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
  * @inode:     The inode to which this block belongs
  * @page:      The page containing the block to decrypt
  * @len:       Size of block to decrypt.  This must be a multiple of
@@ -369,24 +318,10 @@ void fscrypt_msg(const struct inode *inode, const char *level,
 	va_end(args);
 }
 
 static int __init fscrypt_init(void)
 {
-	/*
-	 * Use an unbound workqueue to allow bios to be decrypted in parallel
-	 * even when they happen to complete on the same CPU.  This sacrifices
-	 * locality, but it's worthwhile since decryption is CPU-intensive.
-	 *
-	 * Also use a high-priority workqueue to prioritize decryption work,
-	 * which blocks reads from completing, over regular application tasks.
-	 */
-	fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
-						 WQ_UNBOUND | WQ_HIGHPRI,
-						 num_online_cpus());
-	if (!fscrypt_read_workqueue)
-		panic("failed to allocate fscrypt_read_queue");
-
 	fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_PANIC);
 	fscrypt_init_keyring();
 	return 0;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 43bafdd67dd7..acf5b28eb9d7 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -341,20 +341,17 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry,
 		spin_unlock(&dentry->d_lock);
 	}
 }
 
 /* crypto.c */
-void fscrypt_enqueue_decrypt_work(struct work_struct *);
 
 struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
 		size_t len, size_t offs, gfp_t gfp_flags);
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num);
 
-int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
-				     size_t offs);
 int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num);
 
 static inline bool fscrypt_is_bounce_page(struct page *page)
@@ -448,11 +445,10 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
 bool fscrypt_match_name(const struct fscrypt_name *fname,
 			const u8 *de_name, u32 de_name_len);
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
 
 /* bio.c */
-bool fscrypt_decrypt_bio(struct bio *bio);
 int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
 			  sector_t sector, u64 len);
 
 /* hooks.c */
 int fscrypt_file_open(struct inode *inode, struct file *filp);
@@ -508,13 +504,10 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry,
 					  bool is_nokey_name)
 {
 }
 
 /* crypto.c */
-static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
-{
-}
 
 static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
 		size_t len, size_t offs, gfp_t gfp_flags)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -526,16 +519,10 @@ static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
 						unsigned int offs, u64 lblk_num)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio,
-						   size_t len, size_t offs)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int fscrypt_decrypt_block_inplace(const struct inode *inode,
 						struct page *page,
 						unsigned int len,
 						unsigned int offs, u64 lblk_num)
 {
@@ -749,14 +736,10 @@ static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *nam
 {
 	return 1;
 }
 
 /* bio.c */
-static inline bool fscrypt_decrypt_bio(struct bio *bio)
-{
-	return true;
-}
 
 static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
 					sector_t sector, u64 len)
 {
 	return -EOPNOTSUPP;
@@ -890,40 +873,10 @@ static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk,
 {
 	return nr_blocks;
 }
 #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
-/**
- * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline
- *					encryption
- * @inode: an inode. If encrypted, its key must be set up.
- *
- * Return: true if the inode requires file contents encryption and if the
- *	   encryption should be done in the block layer via blk-crypto rather
- *	   than in the filesystem layer.
- */
-static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode)
-{
-	return fscrypt_needs_contents_encryption(inode) &&
-	       inode->i_sb->s_cop->is_block_based;
-}
-
-/**
- * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer
- *					  encryption
- * @inode: an inode. If encrypted, its key must be set up.
- *
- * Return: true if the inode requires file contents encryption and if the
- *	   encryption should be done in the filesystem layer rather than in the
- *	   block layer via blk-crypto.
- */
-static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
-{
-	return fscrypt_needs_contents_encryption(inode) &&
-	       !inode->i_sb->s_cop->is_block_based;
-}
-
 /**
  * fscrypt_has_encryption_key() - check whether an inode has had its key set up
  * @inode: the inode to check
  *
  * Return: %true if the inode has had its encryption key set up, else %false.
-- 
2.54.0


^ permalink raw reply related

* [PATCH 15/16] fscrypt: Merge bio.c and inline_crypt.c into block.c
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Now that fscrypt always uses blk-crypto on block-based filesystems,
there's no meaningful difference between bio.c and inline_crypt.c.
Therefore merge the two files into one named block.c.

Note: I didn't carry over bio.c's "Copyright (C) 2015, Motorola
Mobility", as none of the code that it applied to remained.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/Makefile                    |   3 +-
 fs/crypto/bio.c                       | 100 --------------------------
 fs/crypto/{inline_crypt.c => block.c} |  96 +++++++++++++++++++++++--
 fs/crypto/fscrypt_private.h           |   2 +-
 include/linux/fscrypt.h               |  22 +++---
 5 files changed, 101 insertions(+), 122 deletions(-)
 delete mode 100644 fs/crypto/bio.c
 rename fs/crypto/{inline_crypt.c => block.c} (79%)

diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index 652c7180ec6d..b03e02f0f09d 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -8,7 +8,6 @@ fscrypto-y := crypto.o \
 	      keyring.o \
 	      keysetup.o \
 	      keysetup_v1.o \
 	      policy.o
 
-fscrypto-$(CONFIG_BLOCK) += bio.o
-fscrypto-$(CONFIG_FS_ENCRYPTION_INLINE_CRYPT) += inline_crypt.o
+fscrypto-$(CONFIG_BLOCK) += block.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
deleted file mode 100644
index db095258cfca..000000000000
--- a/fs/crypto/bio.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Utility functions for file contents encryption/decryption on
- * block device-based filesystems.
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility
- */
-
-#include <linux/bio.h>
-#include <linux/export.h>
-#include <linux/module.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-
-#include "fscrypt_private.h"
-
-struct fscrypt_zero_done {
-	atomic_t		pending;
-	blk_status_t		status;
-	struct completion	done;
-};
-
-static void fscrypt_zeroout_range_done(struct fscrypt_zero_done *done)
-{
-	if (atomic_dec_and_test(&done->pending))
-		complete(&done->done);
-}
-
-static void fscrypt_zeroout_range_end_io(struct bio *bio)
-{
-	struct fscrypt_zero_done *done = bio->bi_private;
-
-	if (bio->bi_status)
-		cmpxchg(&done->status, 0, bio->bi_status);
-	fscrypt_zeroout_range_done(done);
-	bio_put(bio);
-}
-
-/**
- * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
- * @inode: the file's inode
- * @pos: the first file position (in bytes) to zero out
- * @sector: the first sector to zero out
- * @len: bytes to zero out
- *
- * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write
- * ciphertext blocks which decrypt to the all-zeroes block.  The blocks must be
- * both logically and physically contiguous.  It's also assumed that the
- * filesystem only uses a single block device, ->s_bdev.  @len must be a
- * multiple of the file system logical block size.
- *
- * Note that since each block uses a different IV, this involves writing a
- * different ciphertext to each block; we can't simply reuse the same one.
- *
- * Return: 0 on success; -errno on failure.
- */
-int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
-			  sector_t sector, u64 len)
-{
-	struct fscrypt_zero_done done = {
-		.pending	= ATOMIC_INIT(1),
-		.done		= COMPLETION_INITIALIZER_ONSTACK(done.done),
-	};
-
-	if (len == 0)
-		return 0;
-
-	do {
-		struct bio *bio;
-		unsigned int n;
-
-		bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
-				GFP_NOFS);
-		bio->bi_iter.bi_sector = sector;
-		bio->bi_private = &done;
-		bio->bi_end_io = fscrypt_zeroout_range_end_io;
-		fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_NOFS);
-
-		for (n = 0; n < BIO_MAX_VECS; n++) {
-			unsigned int bytes_this_page = min(len, PAGE_SIZE);
-
-			__bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
-			len -= bytes_this_page;
-			pos += bytes_this_page;
-			sector += (bytes_this_page >> SECTOR_SHIFT);
-			if (!len || !fscrypt_mergeable_bio(bio, inode, pos))
-				break;
-		}
-
-		atomic_inc(&done.pending);
-		blk_crypto_submit_bio(bio);
-	} while (len);
-
-	fscrypt_zeroout_range_done(&done);
-
-	wait_for_completion(&done.done);
-	return blk_status_to_errno(done.status);
-}
-EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/block.c
similarity index 79%
rename from fs/crypto/inline_crypt.c
rename to fs/crypto/block.c
index 3c3a46c5af42..60e687da7760 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/block.c
@@ -1,22 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Inline encryption support for fscrypt
+ * File contents en/decryption on block-based filesystems
  *
  * Copyright 2019 Google LLC
  */
 
 /*
- * With "inline encryption", the block layer handles the decryption/encryption
- * as part of the bio, instead of the filesystem doing the crypto itself via
- * crypto API.  See Documentation/block/inline-encryption.rst.  fscrypt still
- * provides the key and IV to use.
+ * This file implements fscrypt's file contents en/decryption using blk-crypto
+ * (Documentation/block/inline-encryption.rst).  fscrypt assigns a bio_crypt_ctx
+ * with a key and IV to each bio, and the block layer does the en/decryption.
+ *
+ * This file's exported functions are called only by block-based filesystems.
  */
 
 #include <linux/blk-crypto.h>
 #include <linux/blkdev.h>
-#include <linux/buffer_head.h>
 #include <linux/export.h>
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/uio.h>
 
@@ -338,5 +338,89 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
 	dun = ci->ci_hashed_ino + lblk;
 
 	return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
 }
 EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks);
+
+struct fscrypt_zero_done {
+	atomic_t		pending;
+	blk_status_t		status;
+	struct completion	done;
+};
+
+static void fscrypt_zeroout_range_done(struct fscrypt_zero_done *done)
+{
+	if (atomic_dec_and_test(&done->pending))
+		complete(&done->done);
+}
+
+static void fscrypt_zeroout_range_end_io(struct bio *bio)
+{
+	struct fscrypt_zero_done *done = bio->bi_private;
+
+	if (bio->bi_status)
+		cmpxchg(&done->status, 0, bio->bi_status);
+	fscrypt_zeroout_range_done(done);
+	bio_put(bio);
+}
+
+/**
+ * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
+ * @inode: the file's inode
+ * @pos: the first file position (in bytes) to zero out
+ * @sector: the first sector to zero out
+ * @len: bytes to zero out
+ *
+ * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write
+ * ciphertext blocks which decrypt to the all-zeroes block.  The blocks must be
+ * both logically and physically contiguous.  It's also assumed that the
+ * filesystem only uses a single block device, ->s_bdev.  @len must be a
+ * multiple of the file system logical block size.
+ *
+ * Note that since each block uses a different IV, this involves writing a
+ * different ciphertext to each block; we can't simply reuse the same one.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
+int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
+			  sector_t sector, u64 len)
+{
+	struct fscrypt_zero_done done = {
+		.pending	= ATOMIC_INIT(1),
+		.done		= COMPLETION_INITIALIZER_ONSTACK(done.done),
+	};
+
+	if (len == 0)
+		return 0;
+
+	do {
+		struct bio *bio;
+		unsigned int n;
+
+		bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
+				GFP_NOFS);
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_private = &done;
+		bio->bi_end_io = fscrypt_zeroout_range_end_io;
+		fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_NOFS);
+
+		for (n = 0; n < BIO_MAX_VECS; n++) {
+			unsigned int bytes_this_page = min(len, PAGE_SIZE);
+
+			__bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
+			len -= bytes_this_page;
+			pos += bytes_this_page;
+			sector += (bytes_this_page >> SECTOR_SHIFT);
+			if (!len || !fscrypt_mergeable_bio(bio, inode, pos))
+				break;
+		}
+
+		atomic_inc(&done.pending);
+		blk_crypto_submit_bio(bio);
+	} while (len);
+
+	fscrypt_zeroout_range_done(&done);
+
+	wait_for_completion(&done.done);
+	return blk_status_to_errno(done.status);
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index da9040407d4a..74329e0953d1 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -393,11 +393,11 @@ void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
 
 void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
 			 const u8 *info, unsigned int infolen,
 			 u8 *okm, unsigned int okmlen);
 
-/* inline_crypt.c */
+/* block.c */
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
 static inline bool
 fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
 {
 	const struct inode *inode = ci->ci_inode;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index acf5b28eb9d7..52ff014aeae6 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -444,14 +444,10 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
 			      struct fscrypt_str *oname);
 bool fscrypt_match_name(const struct fscrypt_name *fname,
 			const u8 *de_name, u32 de_name_len);
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
 
-/* bio.c */
-int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
-			  sector_t sector, u64 len);
-
 /* hooks.c */
 int fscrypt_file_open(struct inode *inode, struct file *filp);
 int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
 			   struct dentry *dentry);
 int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -735,18 +731,10 @@ static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *nam
 				       struct dentry *dentry, unsigned int flags)
 {
 	return 1;
 }
 
-/* bio.c */
-
-static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
-					sector_t sector, u64 len)
-{
-	return -EOPNOTSUPP;
-}
-
 /* hooks.c */
 
 static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
 {
 	if (IS_ENCRYPTED(inode))
@@ -842,20 +830,22 @@ static inline void fscrypt_set_ops(struct super_block *sb,
 {
 }
 
 #endif	/* !CONFIG_FS_ENCRYPTION */
 
-/* inline_crypt.c */
+/* block.c */
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
 
 void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 			       loff_t pos, gfp_t gfp_mask);
 
 bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 			   loff_t pos);
 
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
+int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
+			  sector_t sector, u64 len);
 
 #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
 static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
 					     const struct inode *inode,
@@ -871,10 +861,16 @@ static inline bool fscrypt_mergeable_bio(struct bio *bio,
 static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk,
 					  u64 nr_blocks)
 {
 	return nr_blocks;
 }
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos,
+					sector_t sector, u64 len)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
 /**
  * fscrypt_has_encryption_key() - check whether an inode has had its key set up
  * @inode: the inode to check
-- 
2.54.0


^ permalink raw reply related

* [PATCH 16/16] fscrypt: Add safety checks to non-block-based en/decryption
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

fscrypt_encrypt_pagecache_blocks(), fscrypt_encrypt_block_inplace(), and
fscrypt_decrypt_block_inplace() would dereference a NULL
fscrypt_inode_info pointer if they were to be called on a file that
hasn't been opened yet or on a block-based filesystem.  Since they have
the ability to report errors anyway, add WARN_ON_ONCE checks for this.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/crypto/crypto.c | 61 +++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 27663f4d8705..c91eda62f9a4 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -103,35 +103,44 @@ static int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
 				   fscrypt_direction_t rw, u64 index,
 				   struct page *src_page,
 				   struct page *dest_page, unsigned int len,
 				   unsigned int offs)
 {
-	struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
-	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+	struct crypto_sync_skcipher *tfm;
 	union fscrypt_iv iv;
 	struct scatterlist dst, src;
 	int err;
 
+	if (WARN_ON_ONCE(ci == NULL)) /* File hasn't been opened yet? */
+		return -ENOKEY;
+	tfm = ci->ci_enc_key.tfm;
+	if (WARN_ON_ONCE(tfm == NULL)) /* Called on block-based filesystem? */
+		return -ENOKEY;
+
 	if (WARN_ON_ONCE(len <= 0))
 		return -EINVAL;
 	if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
 		return -EINVAL;
 
 	fscrypt_generate_iv(&iv, index, ci);
 
-	skcipher_request_set_callback(
-		req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-		NULL, NULL);
-	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, len, offs);
-	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, len, offs);
-	skcipher_request_set_crypt(req, &src, &dst, len, &iv);
-	if (rw == FS_DECRYPT)
-		err = crypto_skcipher_decrypt(req);
-	else
-		err = crypto_skcipher_encrypt(req);
+	{
+		SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+		skcipher_request_set_callback(req,
+					      CRYPTO_TFM_REQ_MAY_BACKLOG |
+						      CRYPTO_TFM_REQ_MAY_SLEEP,
+					      NULL, NULL);
+		sg_init_table(&dst, 1);
+		sg_set_page(&dst, dest_page, len, offs);
+		sg_init_table(&src, 1);
+		sg_set_page(&src, src_page, len, offs);
+		skcipher_request_set_crypt(req, &src, &dst, len, &iv);
+		if (rw == FS_DECRYPT)
+			err = crypto_skcipher_decrypt(req);
+		else
+			err = crypto_skcipher_encrypt(req);
+	}
 	if (err)
 		fscrypt_err(ci->ci_inode,
 			    "%scryption failed for data unit %llu: %d",
 			    (rw == FS_DECRYPT ? "De" : "En"), index, err);
 	return err;
@@ -151,11 +160,11 @@ static int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
  *
  * In the bounce page, the ciphertext data will be located at the same offset at
  * which the plaintext data was located in the source page.  Any other parts of
  * the bounce page will be left uninitialized.
  *
- * This is for use by the filesystem's ->writepages() method.
+ * This is for use by the ->writepages() method of non-block-based filesystems.
  *
  * The bounce page allocation is mempool-backed, so it will always succeed when
  * @gfp_flags includes __GFP_DIRECT_RECLAIM, e.g. when it's GFP_NOFS.  However,
  * only the first page of each bio can be allocated this way.  To prevent
  * deadlocks, for any additional pages a mask like GFP_NOWAIT must be used.
@@ -165,18 +174,24 @@ static int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
 struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
 		size_t len, size_t offs, gfp_t gfp_flags)
 {
 	const struct inode *inode = folio->mapping->host;
 	const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
-	const unsigned int du_bits = ci->ci_data_unit_bits;
-	const unsigned int du_size = 1U << du_bits;
+	unsigned int du_bits;
+	unsigned int du_size;
 	struct page *ciphertext_page;
-	u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
-		    (offs >> du_bits);
+	u64 index;
 	unsigned int i;
 	int err;
 
+	if (WARN_ON_ONCE(ci == NULL)) /* File hasn't been opened yet? */
+		return ERR_PTR(-ENOKEY);
+
+	du_bits = ci->ci_data_unit_bits;
+	du_size = 1U << du_bits;
+	index = (folio_pos(folio) + offs) >> du_bits;
+
 	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 	if (WARN_ON_ONCE(!folio_test_locked(folio)))
 		return ERR_PTR(-EINVAL);
 
 	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
@@ -213,11 +228,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
  *
  * Encrypt a possibly-compressed filesystem block that is located in an
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
- * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ * This function only supports non-block-based filesystems that don't support
+ * sub-block data units (as indicated by the fscrypt_operations fields).
  *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
@@ -243,11 +259,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
  *
  * Decrypt a possibly-compressed filesystem block that is located in an
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
- * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ * This function only supports non-block-based filesystems that don't support
+ * sub-block data units (as indicated by the fscrypt_operations fields).
  *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
@@ -273,11 +290,11 @@ EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
 int fscrypt_initialize(struct super_block *sb)
 {
 	mempool_t *pool;
 
 	/* pairs with smp_store_release() below */
-	if (likely(smp_load_acquire(&fscrypt_bounce_page_pool)))
+	if (smp_load_acquire(&fscrypt_bounce_page_pool))
 		return 0;
 
 	/* No need to allocate a bounce page pool if this FS won't use it. */
 	if (!sb->s_cop->needs_bounce_pages)
 		return 0;
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH] block, bfq: protect async queue reset with blkcg locks
From: yu kuai @ 2026-06-24  6:28 UTC (permalink / raw)
  To: Cen Zhang, Tejun Heo, Josef Bacik, Jens Axboe, Arianna Avanzini,
	Paolo Valente
  Cc: linux-block, cgroups, linux-kernel, baijiaju1990, yukuai
In-Reply-To: <20260621135930.2657810-1-zzzccc427@gmail.com>

Hi,

在 2026/6/21 21:59, Cen Zhang 写道:
> Writing 0 to BFQ's low_latency attribute ends weight raising for active,
> idle and async queues. The async cgroup path walks q->blkg_list, converts
> each blkg to BFQ policy data and then reads bfqg->async_bfqq and
> bfqg->async_idle_bfqq.
>
> That walk was protected only by bfqd->lock. blkcg release work is
> serialized by q->blkcg_mutex and q->queue_lock instead, and
> blkg_free_workfn() can call BFQ's pd_free_fn before it removes
> blkg->q_node from q->blkg_list. A low_latency reset can therefore still
> find the blkg on the queue list after the BFQ policy data has been freed.
>
> The buggy scenario involves two paths, with each column showing the order
> within that path:
>
> BFQ low_latency reset:              blkcg blkg release work:
> 1. bfq_low_latency_store()          1. blkg_free_workfn() takes
>     calls bfq_end_wr().                 q->blkcg_mutex.
> 2. bfq_end_wr_async() walks         2. BFQ pd_free_fn drops the
>     q->blkg_list.                       final bfq_group reference.
> 3. blkg_to_bfqg() returns           3. blkg->q_node remains on
>     the stale policy data.              q->blkg_list until list_del_init().
> 4. bfq_end_wr_async_queues()
>     reads async queue fields.
>
> Fix this by taking q->blkcg_mutex and q->queue_lock around the
> q->blkg_list walk, then taking bfqd->lock before touching BFQ async
> queues. The mutex serializes against policy-data free and queue_lock
> stabilizes the list. Move the async reset out of bfq_end_wr()'s existing
> bfqd->lock critical section so the lock order matches blkcg policy
> callbacks.
>
> Validation reproduced this kernel report:
> BUG: KASAN: slab-use-after-free in bfq_end_wr_async_queues+0x246/0x340
>
> Call Trace:
>   <TASK>
>   dump_stack_lvl+0x66/0xa0
>   print_report+0xce/0x630
>   ? bfq_end_wr_async_queues+0x246/0x340
>   ? srso_alias_return_thunk+0x5/0xfbef5
>   ? __virt_addr_valid+0x20d/0x410
>   ? bfq_end_wr_async_queues+0x246/0x340
>   kasan_report+0xe0/0x110
>   ? bfq_end_wr_async_queues+0x246/0x340
>   bfq_end_wr_async_queues+0x246/0x340
>   bfq_end_wr_async+0xba/0x180
>   bfq_low_latency_store+0x4e5/0x690
>   ? 0xffffffffc02150da
>   ? __pfx_bfq_low_latency_store+0x10/0x10
>   ? __pfx_bfq_low_latency_store+0x10/0x10
>   elv_attr_store+0xc4/0x110
>   kernfs_fop_write_iter+0x2f5/0x4a0
>   vfs_write+0x604/0x11f0
>   ? __pfx_locks_remove_posix+0x10/0x10
>   ? __pfx_vfs_write+0x10/0x10
>   ksys_write+0xf9/0x1d0
>   ? __pfx_ksys_write+0x10/0x10
>   do_syscall_64+0x115/0x6a0
>   entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> Allocated by task 544:
>   kasan_save_stack+0x33/0x60
>   kasan_save_track+0x14/0x30
>   __kasan_kmalloc+0xaa/0xb0
>   bfq_pd_alloc+0xc0/0x1b0
>   blkg_alloc+0x346/0x960
>   blkg_create+0x8c2/0x10d0
>   bio_associate_blkg_from_css+0x9f3/0xfa0
>   bio_associate_blkg+0xd9/0x200
>   bio_init+0x303/0x640
>   __blkdev_direct_IO_simple+0x56b/0x8a0
>   blkdev_direct_IO+0x8e7/0x2580
>   blkdev_read_iter+0x205/0x400
>   vfs_read+0x7b0/0xda0
>   ksys_read+0xf9/0x1d0
>   do_syscall_64+0x115/0x6a0
>   entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> Freed by task 465:
>   kasan_save_stack+0x33/0x60
>   kasan_save_track+0x14/0x30
>   kasan_save_free_info+0x3b/0x60
>   __kasan_slab_free+0x5f/0x80
>   kfree+0x307/0x580
>   blkg_free_workfn+0xef/0x460
>   process_one_work+0x8d0/0x1870
>   worker_thread+0x575/0xf80
>   kthread+0x2e7/0x3c0
>   ret_from_fork+0x576/0x810
>   ret_from_fork_asm+0x1a/0x30
>
> Fixes: 44e44a1b329e ("block, bfq: improve responsiveness")
> Assisted-by: Codex:gpt-5.5
> Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
> ---
>   block/bfq-cgroup.c  | 13 ++++++++++++-
>   block/bfq-iosched.c |  3 ++-
>   2 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
> index 0bd0332b3d78..d8fdace464b4 100644
> --- a/block/bfq-cgroup.c
> +++ b/block/bfq-cgroup.c
> @@ -936,14 +936,23 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
>   
>   void bfq_end_wr_async(struct bfq_data *bfqd)
>   {
> +	struct request_queue *q = bfqd->queue;
>   	struct blkcg_gq *blkg;
>   
> -	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
> +	mutex_lock(&q->blkcg_mutex);
> +	spin_lock_irq(&q->queue_lock);
> +	spin_lock(&bfqd->lock);

I just noticed this patch; the same problem is already fixed by another patchset
that I posted. Since this patch has already been applied by Jens, I'll rebase my patchset.

BTW, I'm also trying to get rid of queue_lock for blkg protection.

> +
> +	list_for_each_entry(blkg, &q->blkg_list, q_node) {
>   		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
>   
>   		bfq_end_wr_async_queues(bfqd, bfqg);
>   	}
>   	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
> +
> +	spin_unlock(&bfqd->lock);
> +	spin_unlock_irq(&q->queue_lock);
> +	mutex_unlock(&q->blkcg_mutex);
>   }
>   
>   static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v)
> @@ -1416,7 +1425,9 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
>   
>   void bfq_end_wr_async(struct bfq_data *bfqd)
>   {
> +	spin_lock_irq(&bfqd->lock);
>   	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
> +	spin_unlock_irq(&bfqd->lock);
>   }
>   
>   struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio)
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 141c602d5e85..eec9be62061b 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -2653,9 +2653,10 @@ static void bfq_end_wr(struct bfq_data *bfqd)
>   	}
>   	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
>   		bfq_bfqq_end_wr(bfqq);
> -	bfq_end_wr_async(bfqd);
>   
>   	spin_unlock_irq(&bfqd->lock);
> +
> +	bfq_end_wr_async(bfqd);
>   }
>   
>   static sector_t bfq_io_struct_pos(void *io_struct, bool request)

-- 
Thanks,
Kuai

^ permalink raw reply

* [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Yu Kuai <yukuai@fnnas.com>

Add a module parameter 'fault_tolerant' that changes how md-linear
handles disk failures. When enabled:

- Disk failures are isolated instead of failing the entire array
- I/O to failed disks returns -EIO while healthy disks continue
- The array remains operational with reduced capacity
- Failed disk count is tracked and shown in /proc/mdstat

This enables unraid-like functionality where individual disk failures
don't bring down the entire array, allowing continued access to data
on healthy disks.

The fault_tolerant parameter can be set at module load time or
dynamically via /sys/module/md_linear/parameters/fault_tolerant.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md-linear.c | 63 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 8d7b82c4a723..8afc6665cfde 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -2,6 +2,10 @@
 /*
  * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
  * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ *
+ * Fault-tolerant mode added for unraid-like setups.
+ * When fault_tolerant=1, disk failures are isolated - I/O to failed disks
+ * returns -EIO while healthy disks continue operating normally.
  */

 #include <linux/blkdev.h>
@@ -21,9 +25,15 @@ struct linear_conf {
 	sector_t                array_sectors;
 	/* a copy of mddev->raid_disks */
 	int                     raid_disks;
+	atomic_t		failed_disks;	/* count of failed disks */
 	struct dev_info         disks[] __counted_by(raid_disks);
 };

+static bool fault_tolerant;
+module_param(fault_tolerant, bool, 0644);
+MODULE_PARM_DESC(fault_tolerant,
+	"Enable fault-tolerant mode: isolate disk failures instead of failing array (default: false)");
+
 /*
  * find which device holds a particular offset
  */
@@ -96,6 +106,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	if (!conf)
 		return ERR_PTR(-ENOMEM);

+	atomic_set(&conf->failed_disks, 0);
+
 	/*
 	 * conf->raid_disks is copy of mddev->raid_disks. The reason to
 	 * keep a copy of mddev->raid_disks in struct linear_conf is,
@@ -251,7 +263,8 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		     bio_sector < start_sector))
 		goto out_of_bounds;

-	if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+	if (unlikely(is_rdev_broken(tmp_dev->rdev) ||
+		     test_bit(Faulty, &tmp_dev->rdev->flags))) {
 		md_error(mddev, tmp_dev->rdev);
 		bio_io_error(bio);
 		return true;
@@ -296,16 +309,47 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)

 static void linear_status(struct seq_file *seq, struct mddev *mddev)
 {
+	struct linear_conf *conf = mddev->private;
+
 	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+	if (fault_tolerant) {
+		int failed = atomic_read(&conf->failed_disks);
+
+		seq_puts(seq, " fault-tolerant");
+		if (failed)
+			seq_printf(seq, " [%d failed]", failed);
+	}
 }

 static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
 {
-	if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
-		char *md_name = mdname(mddev);
-
-		pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
-			md_name, rdev->bdev);
+	char *md_name = mdname(mddev);
+
+	if (fault_tolerant) {
+		/*
+		 * Fault-tolerant mode: isolate the failed disk instead of
+		 * failing the entire array. I/O to this disk will return -EIO
+		 * but other disks continue operating normally.
+		 */
+		if (!test_and_set_bit(Faulty, &rdev->flags)) {
+			struct linear_conf *conf = mddev->private;
+
+			atomic_inc(&conf->failed_disks);
+			pr_warn("md/linear%s: Disk failure on %pg detected, isolating device (fault-tolerant mode).\n",
+				md_name, rdev->bdev);
+			pr_warn("md/linear%s: %d disk(s) now failed, array continues with reduced capacity.\n",
+				md_name, atomic_read(&conf->failed_disks));
+			/* Notify userspace about the state change */
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
+		}
+	} else {
+		/*
+		 * Standard mode: fail the entire array on any disk failure.
+		 */
+		if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+			pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+				md_name, rdev->bdev);
+		}
 	}
 }

@@ -344,7 +388,7 @@ static void linear_exit(void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD with optional fault-tolerant mode");
 MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
 MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
--
2.43.0

^ permalink raw reply related

* [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Yu Kuai <yukuai@fnnas.com>

Add support for an "unraid" mount option that enables a special mode
designed for use with fault-tolerant md-linear arrays. In this mode:

1. Variable block groups: Each block group can have a different size,
   allowing one physical disk per group. Lookup tables are used for
   block-to-group mapping instead of fixed-size calculations.

2. Distributed metadata: Every block group has its own superblock and
   group descriptor table copy, enabling the filesystem to remain
   accessible even if some disks fail.

3. Single-group allocation: Files are allocated entirely within a
   single block group. If a group doesn't have enough space, the
   allocation fails with -ENOSPC instead of trying other groups.
   This ensures each file resides on a single physical disk.

4. Inode locality: Inodes are allocated in the same group as their
   parent directory, keeping files and their metadata on the same disk.

This enables unraid-like functionality where:
- Each disk is independent and can be read separately
- Disk failures only affect files on that specific disk
- The filesystem continues operating with reduced capacity

Usage:
  mount -t ext4 -o unraid /dev/md0 /mnt

Note: This requires a specially formatted filesystem where each block
group corresponds to one physical disk. A future mkfs.ext4 extension
will support creating such filesystems.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 fs/ext4/balloc.c  | 45 ++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/ext4.h    | 15 ++++++++++++++-
 fs/ext4/ialloc.c  | 13 +++++++++++++
 fs/ext4/mballoc.c |  8 ++++++++
 fs/ext4/super.c   | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8040c731b3e4..bd151dc5480b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -54,17 +54,43 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
 {
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
 	ext4_grpblk_t offset;

 	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+
+	/* Unraid mode: binary search through variable-size groups */
+	if (sbi->s_group_first_block) {
+		ext4_group_t lo = 0, hi = sbi->s_groups_count - 1;
+		ext4_fsblk_t first_data = le32_to_cpu(es->s_first_data_block);
+
+		blocknr += first_data; /* restore original block number */
+
+		while (lo < hi) {
+			ext4_group_t mid = (lo + hi + 1) / 2;
+
+			if (blocknr < sbi->s_group_first_block[mid])
+				hi = mid - 1;
+			else
+				lo = mid;
+		}
+		if (blockgrpp)
+			*blockgrpp = lo;
+		if (offsetp) {
+			offset = (blocknr - sbi->s_group_first_block[lo]) >>
+				 sbi->s_cluster_bits;
+			*offsetp = offset;
+		}
+		return;
+	}
+
 	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
-		EXT4_SB(sb)->s_cluster_bits;
+		sbi->s_cluster_bits;
 	if (offsetp)
 		*offsetp = offset;
 	if (blockgrpp)
 		*blockgrpp = blocknr;
-
 }

 /*
@@ -162,8 +188,13 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
 static unsigned int num_clusters_in_group(struct super_block *sb,
 					  ext4_group_t block_group)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned int blocks;

+	/* Unraid mode: use per-group blocks count */
+	if (sbi->s_group_blocks_count)
+		return EXT4_NUM_B2C(sbi, sbi->s_group_blocks_count[block_group]);
+
 	if (block_group == ext4_get_groups_count(sb) - 1) {
 		/*
 		 * Even though mke2fs always initializes the first and
@@ -171,11 +202,11 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
 		 * we need to make sure we calculate the right free
 		 * blocks.
 		 */
-		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+		blocks = ext4_blocks_count(sbi->s_es) -
 			ext4_group_first_block_no(sb, block_group);
 	} else
 		blocks = EXT4_BLOCKS_PER_GROUP(sb);
-	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+	return EXT4_NUM_B2C(sbi, blocks);
 }

 /* Initializes an uninitialized block bitmap */
@@ -855,6 +886,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;

+	/*
+	 * Unraid mode: every group has a superblock copy for fault tolerance.
+	 * This allows mounting the filesystem even if some disks fail.
+	 */
+	if (test_opt2(sb, UNRAID))
+		return 1;
+
 	if (group == 0)
 		return 1;
 	if (ext4_has_feature_sparse_super2(sb)) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56112f201cac..063e37a82654 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1295,6 +1295,9 @@ struct ext4_inode_info {
 						    * scanning in mballoc
 						    */
 #define EXT4_MOUNT2_ABORT		0x00000100 /* Abort filesystem */
+#define EXT4_MOUNT2_UNRAID		0x00000200 /* Unraid mode: one disk per
+						    * group, single-group alloc
+						    */

 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1687,6 +1690,10 @@ struct ext4_sb_info {
 	struct flex_groups * __rcu *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;

+	/* Unraid mode: variable block groups (one disk per group) */
+	ext4_fsblk_t *s_group_first_block;	/* First block of each group */
+	ext4_grpblk_t *s_group_blocks_count;	/* Blocks count per group */
+
 	/* workqueue for reserved extent conversions (buffered io) */
 	struct workqueue_struct *rsv_conversion_wq;

@@ -2627,8 +2634,14 @@ struct dir_private_info {
 static inline ext4_fsblk_t
 ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Unraid mode: variable block groups, use lookup table */
+	if (sbi->s_group_first_block)
+		return sbi->s_group_first_block[group_no];
+
 	return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
-		le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+		le32_to_cpu(sbi->s_es->s_first_data_block);
 }

 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b20a1bf866ab..98fda602073e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -438,6 +438,19 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	int flex_size = ext4_flex_bg_size(sbi);
 	struct dx_hash_info hinfo;

+	/*
+	 * Unraid mode: always allocate inode in parent's group.
+	 * This ensures files and their inodes stay on the same disk.
+	 */
+	if (test_opt2(sb, UNRAID)) {
+		desc = ext4_get_group_desc(sb, parent_group, NULL);
+		if (desc && ext4_free_inodes_count(sb, desc) > 0) {
+			*group = parent_group;
+			return 0;
+		}
+		return -1; /* No free inodes in parent's group */
+	}
+
 	ngroups = real_ngroups;
 	if (flex_size > 1) {
 		ngroups = (real_ngroups + flex_size - 1) >>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 56d50fd3310b..9de674ec2f77 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2997,6 +2997,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	if (err || ac->ac_status == AC_STATUS_FOUND)
 		goto out;

+	/*
+	 * Unraid mode: files must be allocated entirely within a single group.
+	 * If the goal group doesn't have enough space, fail with -ENOSPC
+	 * instead of trying other groups.
+	 */
+	if (test_opt2(sb, UNRAID))
+		goto out;
+
 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
 		goto out;

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 87205660c5d0..9534a4ffbee7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1255,6 +1255,12 @@ static void ext4_group_desc_free(struct ext4_sb_info *sbi)
 		brelse(group_desc[i]);
 	kvfree(group_desc);
 	rcu_read_unlock();
+
+	/* Free unraid mode arrays */
+	kvfree(sbi->s_group_first_block);
+	kvfree(sbi->s_group_blocks_count);
+	sbi->s_group_first_block = NULL;
+	sbi->s_group_blocks_count = NULL;
 }

 static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
@@ -1677,6 +1683,7 @@ enum {
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
 	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+	Opt_unraid,
 #ifdef CONFIG_EXT4_DEBUG
 	Opt_fc_debug_max_replay, Opt_fc_debug_force
 #endif
@@ -1819,6 +1826,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("noreservation",	Opt_removed),	/* mount option from ext2/3 */
 	fsparam_u32	("journal",		Opt_removed),	/* mount option from ext2/3 */
+	fsparam_flag	("unraid",		Opt_unraid),
 	{}
 };

@@ -1912,6 +1920,7 @@ static const struct mount_opts {
 	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
 #endif
 	{Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
+	{Opt_unraid, EXT4_MOUNT2_UNRAID, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
 	{Opt_err, 0, 0}
 };

@@ -4845,6 +4854,65 @@ static int ext4_check_geometry(struct super_block *sb,
 	return 0;
 }

+/*
+ * Initialize unraid mode data structures.
+ * In unraid mode, each block group can have a different size (one disk per group).
+ * This function allocates and populates the lookup tables for variable-size groups.
+ *
+ * For now, this uses the standard fixed-size groups from the superblock.
+ * A future mkfs extension will store per-group sizes in the group descriptors.
+ */
+static int ext4_unraid_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_fsblk_t first_data_block;
+	ext4_group_t i;
+
+	if (!test_opt2(sb, UNRAID))
+		return 0;
+
+	sbi->s_group_first_block = kvmalloc_array(ngroups,
+						  sizeof(ext4_fsblk_t),
+						  GFP_KERNEL);
+	if (!sbi->s_group_first_block)
+		return -ENOMEM;
+
+	sbi->s_group_blocks_count = kvmalloc_array(ngroups,
+						   sizeof(ext4_grpblk_t),
+						   GFP_KERNEL);
+	if (!sbi->s_group_blocks_count) {
+		kvfree(sbi->s_group_first_block);
+		sbi->s_group_first_block = NULL;
+		return -ENOMEM;
+	}
+
+	/*
+	 * Initialize with standard fixed-size groups for now.
+	 * TODO: Read per-group sizes from extended group descriptors
+	 * when mkfs supports creating variable-size groups.
+	 */
+	first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	for (i = 0; i < ngroups; i++) {
+		sbi->s_group_first_block[i] = first_data_block +
+			(ext4_fsblk_t)i * EXT4_BLOCKS_PER_GROUP(sb);
+
+		if (i == ngroups - 1) {
+			/* Last group may be smaller */
+			sbi->s_group_blocks_count[i] =
+				ext4_blocks_count(sbi->s_es) -
+				sbi->s_group_first_block[i];
+		} else {
+			sbi->s_group_blocks_count[i] = EXT4_BLOCKS_PER_GROUP(sb);
+		}
+	}
+
+	ext4_msg(sb, KERN_INFO, "unraid mode enabled: %u groups",
+		 ngroups);
+
+	return 0;
+}
+
 static int ext4_group_desc_init(struct super_block *sb,
 				struct ext4_super_block *es,
 				ext4_fsblk_t logical_sb_block,
@@ -4904,7 +4972,8 @@ static int ext4_group_desc_init(struct super_block *sb,
 		return -EFSCORRUPTED;
 	}

-	return 0;
+	/* Initialize unraid mode data structures if enabled */
+	return ext4_unraid_init(sb);
 }

 static int ext4_load_and_init_journal(struct super_block *sb,
--
2.43.0

^ permalink raw reply related

* [PATCH v2 0/4] blk-cgroup: fix blkg list and policy data races
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Yu Kuai <yukuai@fygo.io>

Hi,

This series fixes races around q->blkg_list and blkg policy data
lifetime.

Patch 1 protects blkg_destroy_all()'s q->blkg_list walk with
blkcg_mutex.

Patches 2-3 fix races between blkcg_activate_policy() and concurrent
blkg destruction.

Patch 4 factors the policy data teardown loop into a helper after the
race fixes.

Changes since v1:
- Drop the BFQ q->blkg_list patch because the current block tree already
  has a stronger fix in commit 17b2d950a3c0 ("block, bfq: protect async
  queue reset with blkcg locks").
- Add Reviewed-by tags from Tang Yizhou.

Yu Kuai (1):
  blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with
    blkcg_mutex

Zheng Qixing (3):
  blk-cgroup: fix race between policy activation and blkg destruction
  blk-cgroup: skip dying blkg in blkcg_activate_policy()
  blk-cgroup: factor policy pd teardown loop into helper

 block/blk-cgroup.c | 65 +++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

-- 
2.51.0

^ permalink raw reply

* [PATCH v2 1/4] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Yu Kuai <yukuai@fygo.io>

blkg_destroy_all() iterates q->blkg_list without holding blkcg_mutex,
which can race with blkg_free_workfn() that removes blkgs from the list
while holding blkcg_mutex.

Add blkcg_mutex protection around the q->blkg_list iteration to prevent
potential list corruption or use-after-free issues.

Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ee076ab795d3..7baccfb690fe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -574,10 +574,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 	struct blkcg_gq *blkg;
 	int count = BLKG_DESTROY_BATCH_SIZE;
 	int i;
 
 restart:
+	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
 		if (hlist_unhashed(&blkg->blkcg_node))
@@ -592,10 +593,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 		 * it when a batch of blkgs are destroyed.
 		 */
 		if (!(--count)) {
 			count = BLKG_DESTROY_BATCH_SIZE;
 			spin_unlock_irq(&q->queue_lock);
+			mutex_unlock(&q->blkcg_mutex);
 			cond_resched();
 			goto restart;
 		}
 	}
 
@@ -611,10 +613,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 			__clear_bit(pol->plid, q->blkcg_pols);
 	}
 
 	q->root_blkg = NULL;
 	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 
 	wake_up_var(&q->root_blkg);
 }
 
 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-- 
2.51.0


^ permalink raw reply related

* [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Zheng Qixing <zhengqixing@huawei.com>

When switching an IO scheduler on a block device, blkcg_activate_policy()
allocates blkg_policy_data (pd) for all blkgs attached to the queue.
However, blkcg_activate_policy() may race with concurrent blkcg deletion,
leading to use-after-free and memory leak issues.

The use-after-free occurs in the following race:

T1 (blkcg_activate_policy):
  - Successfully allocates pd for blkg1 (loop0->queue, blkcgA)
  - Fails to allocate pd for blkg2 (loop0->queue, blkcgB)
  - Enters the enomem rollback path to release blkg1 resources

T2 (blkcg deletion):
  - blkcgA is deleted concurrently
  - blkg1 is freed via blkg_free_workfn()
  - blkg1->pd is freed

T1 (continued):
  - Rollback path accesses blkg1->pd->online after pd is freed
  - Triggers use-after-free

In addition, blkg_free_workfn() frees pd before removing the blkg from
q->blkg_list. This allows blkcg_activate_policy() to allocate a new pd
for a blkg that is being destroyed, leaving the newly allocated pd
unreachable when the blkg is finally freed.

Fix these races by extending blkcg_mutex coverage to serialize
blkcg_activate_policy() rollback and blkg destruction, ensuring pd
lifecycle is synchronized with blkg list visibility.

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7baccfb690fe..f7e788a7fe95 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
 		return -EINVAL;
 
 	if (queue_is_mq(q))
 		memflags = blk_mq_freeze_queue(q);
+
+	mutex_lock(&q->blkcg_mutex);
 retry:
 	spin_lock_irq(&q->queue_lock);
 
 	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
@@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
 
 	spin_unlock_irq(&q->queue_lock);
 out:
+	mutex_unlock(&q->blkcg_mutex);
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)
 		blkg_put(pinned_blkg);
 	if (pd_prealloc)
-- 
2.51.0


^ permalink raw reply related

* [PATCH v2 3/4] blk-cgroup: skip dying blkg in blkcg_activate_policy()
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Zheng Qixing <zhengqixing@huawei.com>

When switching IO schedulers on a block device, blkcg_activate_policy()
can race with concurrent blkcg deletion, leading to a use-after-free in
rcu_accelerate_cbs.

T1:                               T2:
                                  blkg_destroy
                                  kill(&blkg->refcnt) // blkg->refcnt=1->0
                                  blkg_release // call_rcu(__blkg_release)
                                  ...
                                  blkg_free_workfn
                                  ->pd_free_fn(pd)
elv_iosched_store
elevator_switch
...
iterate blkg list
blkg_get(blkg) // blkg->refcnt=0->1
                                  list_del_init(&blkg->q_node)
blkg_put(pinned_blkg) // blkg->refcnt=1->0
blkg_release // call_rcu again
rcu_accelerate_cbs // uaf

Fix this by checking hlist_unhashed(&blkg->blkcg_node) before getting
a reference to the blkg. This is the same check used in blkg_destroy()
to detect if a blkg has already been destroyed. If the blkg is already
unhashed, skip processing it since it's being destroyed.

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f7e788a7fe95..2538d8105e6c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1574,10 +1574,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
 		struct blkg_policy_data *pd;
 
 		if (blkg->pd[pol->plid])
 			continue;
+		if (hlist_unhashed(&blkg->blkcg_node))
+			continue;
 
 		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
 		if (blkg == pinned_blkg) {
 			pd = pd_prealloc;
 			pd_prealloc = NULL;
-- 
2.51.0


^ permalink raw reply related

* [PATCH v2 4/4] blk-cgroup: factor policy pd teardown loop into helper
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

From: Zheng Qixing <zhengqixing@huawei.com>

Move the teardown sequence which offlines and frees per-policy
blkg_policy_data (pd) into a helper for readability.

No functional change intended.

Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 57 ++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2538d8105e6c..e5e95be4fbc0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1526,10 +1526,35 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.depends_on = 1 << memory_cgrp_id,
 #endif
 };
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
+/*
+ * Tear down per-blkg policy data for @pol on @q.
+ */
+static void blkcg_policy_teardown_pds(struct request_queue *q,
+				      const struct blkcg_policy *pol)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		struct blkcg *blkcg = blkg->blkcg;
+		struct blkg_policy_data *pd;
+
+		spin_lock(&blkcg->lock);
+		pd = blkg->pd[pol->plid];
+		if (pd) {
+			if (pd->online && pol->pd_offline_fn)
+				pol->pd_offline_fn(pd);
+			pd->online = false;
+			pol->pd_free_fn(pd);
+			blkg->pd[pol->plid] = NULL;
+		}
+		spin_unlock(&blkcg->lock);
+	}
+}
+
 /**
  * blkcg_activate_policy - activate a blkcg policy on a gendisk
  * @disk: gendisk of interest
  * @pol: blkcg policy to activate
  *
@@ -1641,25 +1666,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	return ret;
 
 enomem:
 	/* alloc failed, take down everything */
 	spin_lock_irq(&q->queue_lock);
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-		struct blkg_policy_data *pd;
-
-		spin_lock(&blkcg->lock);
-		pd = blkg->pd[pol->plid];
-		if (pd) {
-			if (pd->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(pd);
-			pd->online = false;
-			pol->pd_free_fn(pd);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	ret = -ENOMEM;
 	goto out;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1674,11 +1685,10 @@ EXPORT_SYMBOL_GPL(blkcg_activate_policy);
  */
 void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *blkg;
 	unsigned int memflags;
 
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
@@ -1687,24 +1697,11 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 
 	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
-
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-
-		spin_lock(&blkcg->lock);
-		if (blkg->pd[pol->plid]) {
-			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(blkg->pd[pol->plid]);
-			pol->pd_free_fn(blkg->pd[pol->plid]);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
-
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups
From: yu kuai @ 2026-06-24  6:55 UTC (permalink / raw)
  To: Yu Kuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel, yukuai
In-Reply-To: <20260624064625.1743650-1-yukuai@kernel.org>

Hi,

Please ignore this patch; it is only supposed to be used downstream.
AI somehow generated the command that sent it together with the patchset:

blk-cgroup: fix blkg list and policy data races

Same for the other ext4 patch.

Sorry for the noise. :(

在 2026/6/24 14:46, Yu Kuai 写道:
> From: Yu Kuai<yukuai@fnnas.com>
>
> Add a module parameter 'fault_tolerant' that changes how md-linear
> handles disk failures. When enabled:
>
> - Disk failures are isolated instead of failing the entire array
> - I/O to failed disks returns -EIO while healthy disks continue
> - The array remains operational with reduced capacity
> - Failed disk count is tracked and shown in /proc/mdstat
>
> This enables unraid-like functionality where individual disk failures
> don't bring down the entire array, allowing continued access to data
> on healthy disks.
>
> The fault_tolerant parameter can be set at module load time or
> dynamically via /sys/module/md_linear/parameters/fault_tolerant.
>
> Signed-off-by: Yu Kuai<yukuai@fnnas.com>
> ---
>   drivers/md/md-linear.c | 63 ++++++++++++++++++++++++++++++++++++------
>   1 file changed, 55 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c

-- 
Thanks,
Kuai

^ permalink raw reply

* Re: [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths
From: yu kuai @ 2026-06-24  6:57 UTC (permalink / raw)
  To: Yu Kuai, nilay, tom.leiming, bvanassche, tj, josef, axboe
  Cc: akpm, chrisl, kasong, shikemeng, nphamcs, bhe, baohua,
	youngjun.park, cgroups, linux-block, linux-kernel, linux-mm,
	yukuai
In-Reply-To: <cover.1780621988.git.yukuai@fygo.io>

Friendly ping ...

This set can still be applied cleanly to the block-7.2 branch.

在 2026/6/8 11:42, Yu Kuai 写道:
> From: Yu Kuai <yukuai@fygo.io>
>
> Hi,
>
> This series is the follow-up blk-cgroup locking cleanup on top of the
> earlier blkg-list protection fixes, and prepares blk-cgroup to stop using
> q->queue_lock as the global blkg lifetime/iteration lock.
>
> The current queue_lock based protection is hard to maintain because
> queue_lock is used from hardirq and softirq completion paths, while some
> blkcg cgroup file paths also need to iterate blkgs, print policy data, or
> create blkgs from RCU-protected contexts.  This series first tightens the
> blkcg-side lifetime rules:
>
> - blkcg_print_stat() iterates blkgs under blkcg->lock with IRQs disabled.
> - policy data freeing is delayed past an RCU grace period.
> - blkcg_print_blkgs(), blkg lookup/create, bio association, page-IO
>    association, blkg destruction, and BFQ initialization stop nesting
>    queue_lock under RCU or blkcg->lock.
>
> Using blkcg->lock and RCU for blkcg-owned lists/data keeps the lock order
> local to blk-cgroup and avoids extending queue_lock into cgroup file
> iteration paths.  It also makes the subsequent conversion to q->blkcg_mutex
> possible without carrying forward queue_lock's interrupt-context
> constraints.
>
> Yu Kuai (8):
>    blk-cgroup: protect iterating blkgs with blkcg->lock in
>      blkcg_print_stat()
>    blk-cgroup: delay freeing policy data after rcu grace period
>    blk-cgroup: don't nest queue_lock under rcu in blkcg_print_blkgs()
>    blk-cgroup: don't nest queue_lock under rcu in blkg_lookup_create()
>    blk-cgroup: don't nest queue_lock under rcu in bio_associate_blkg()
>    blk-cgroup: don't nest queue_lock under blkcg->lock in
>      blkcg_destroy_blkgs()
>    mm/page_io: don't nest queue_lock under rcu in
>      bio_associate_blkg_from_page()
>    block, bfq: don't grab queue_lock to initialize bfq
>
>   block/bfq-cgroup.c        |  17 ++++-
>   block/bfq-iosched.c       |   5 --
>   block/blk-cgroup-rwstat.c |  15 ++--
>   block/blk-cgroup.c        | 151 ++++++++++++++++++++++----------------
>   block/blk-cgroup.h        |   8 +-
>   block/blk-iocost.c        |  22 ++++--
>   block/blk-iolatency.c     |  10 ++-
>   block/blk-throttle.c      |  13 +++-
>   mm/page_io.c              |   7 +-
>   9 files changed, 158 insertions(+), 90 deletions(-)
>
>
> base-commit: b23df513de562739af61fa61ba80ef5e8059a636

-- 
Thanks,
Kuai

^ permalink raw reply

* [PATCH] null_blk: cancel bw_timer on add-device error unwind
From: Cen Zhang @ 2026-06-24  7:18 UTC (permalink / raw)
  To: Jens Axboe, Keith Busch, Johannes Thumshirn, Chaitanya Kulkarni,
	Damien Le Moal, Genjian Zhang, Hans Holmberg, Nilay Shroff,
	Kees Cook, Matthew Wilcox, Christophe JAILLET, Kyungchan Koh,
	Shaohua Li
  Cc: linux-block, linux-kernel, baijiaju1990, zzzccc427

null_blk starts the bandwidth hrtimer before the later add_disk/device_add
failure points. If setup fails after the timer is queued, the shared error
unwind frees struct nullb without draining bw_timer, so the callback can
run on freed owner state.

The buggy scenario involves two paths, with each column showing the order
within that path:

null_add_dev() error unwind:        nullb_bwtimer_fn() callback path:
1. Start bw_timer for a throttled   1. The hrtimer expires after the free.
   device.                          2. nullb_bwtimer_fn() recovers the
2. Hit a later add_disk/device_add     embedded owner.
   failure.                         3. The callback reads nullb->dev and
3. Free struct nullb.                  nullb->q.
4. Release the remaining queue and   4. The stale owner storage is used
   disk resources.                     after free.

Cancel bw_timer in the shared error unwind before put_disk() and the
remaining frees. The normal delete path already uses the same
hrtimer_cancel() drain.

Validation reproduced this kernel report:
BUG: KASAN: slab-use-after-free in nullb_bwtimer_fn+0x13f/0x170 [null_blk]

Call Trace:
<IRQ>
 dump_stack_lvl+0x66/0xa0
 print_report+0xce/0x630
 ? nullb_bwtimer_fn+0x13f/0x170 [null_blk]
 ? srso_alias_return_thunk+0x5/0xfbef5
 ? __virt_addr_valid+0x20d/0x410
 ? nullb_bwtimer_fn+0x13f/0x170 [null_blk]
 kasan_report+0xe0/0x110
 ? nullb_bwtimer_fn+0x13f/0x170 [null_blk]
 ? __pfx_nullb_bwtimer_fn+0x10/0x10 [null_blk]
 nullb_bwtimer_fn+0x13f/0x170 [null_blk]
 __hrtimer_run_queues+0x172/0x810
 hrtimer_interrupt+0x377/0x7f0
 __sysvec_apic_timer_interrupt+0xc3/0x390
 sysvec_apic_timer_interrupt+0x67/0x80
</IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x1a/0x20

Allocated by task 529:
 kasan_save_stack+0x33/0x60
 kasan_save_track+0x14/0x30
 __kasan_kmalloc+0xaa/0xb0
 null_add_dev+0x4f9/0x1d10 [null_blk]
 nullb_device_power_store+0x25f/0x320 [null_blk]
 configfs_write_iter+0x2be/0x4a0
 vfs_write+0x604/0x11f0
 ksys_write+0xf9/0x1d0
 do_syscall_64+0x115/0x6a0
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 529:
 kasan_save_stack+0x33/0x60
 kasan_save_track+0x14/0x30
 kasan_save_free_info+0x3b/0x60
 __kasan_slab_free+0x5f/0x80
 kfree+0x307/0x580
 null_add_dev+0x1272/0x1d10 [null_blk]
 nullb_device_power_store+0x25f/0x320 [null_blk]
 configfs_write_iter+0x2be/0x4a0
 vfs_write+0x604/0x11f0
 ksys_write+0xf9/0x1d0
 do_syscall_64+0x115/0x6a0
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: eff2c4f10873 ("nullb: bandwidth control")
Assisted-by: Codex:gpt-5.5
Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
---
 drivers/block/null_blk/main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index f8c0fd57e041..8f1ad76710a0 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -2062,6 +2062,8 @@ static int null_add_dev(struct nullb_device *dev)
 out_ida_free:
 	ida_free(&nullb_indexes, nullb->index);
 out_cleanup_disk:
+	if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags))
+		hrtimer_cancel(&nullb->bw_timer);
 	put_disk(nullb->disk);
 out_cleanup_zone:
 	null_free_zoned_dev(dev);
-- 
2.43.0

^ permalink raw reply related

* [PATCH blktests] README.md, check: require getconf
From: Shin'ichiro Kawasaki @ 2026-06-24  7:18 UTC (permalink / raw)
  To: linux-block
  Cc: Omar Sandoval, Chaitanya Kulkarni, Bart Van Assche, Jeff Moyer,
	Shin'ichiro Kawasaki

Some test cases use the getconf command to query the page size that the
kernel supports (e.g. scsi/011, throtl/{002,003,007}, zbd/{010,014}).
Add getconf to the list of required commands so its absence is reported
clearly, and document it as a dependency in README.md.

Link: https://lore.kernel.org/linux-block/ajm1x0koQ4BftBOc@shinmob/
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
---
 README.md | 1 +
 check     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b137a43..b62540a 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ The dependencies are minimal, but make sure you have them installed:
 - fio
 - gcc
 - make
+- getconf
 - systemd-udev (udevadm)
 
 Some tests require the following:
diff --git a/check b/check
index a68049b..bc2dde9 100755
--- a/check
+++ b/check
@@ -1118,6 +1118,7 @@ _check_dependencies() {
 	required_commands_and_packages[blockdev]="util-linux"
 	required_commands_and_packages[fio]="fio"
 	required_commands_and_packages[udevadm]="systemd-udev"
+	required_commands_and_packages[getconf]="glibc-common or libc-bin"
 
 	for cmd in "${!required_commands_and_packages[@]}"; do
 		command -v "$cmd" &> /dev/null && continue
-- 
2.54.0


^ permalink raw reply related

* [PATCH] loop: serialize backing file swaps with sysfs readers
From: Cen Zhang @ 2026-06-24  7:18 UTC (permalink / raw)
  To: Jens Axboe, Kay Sievers
  Cc: linux-block, linux-kernel, baijiaju1990, zzzccc427

The backing_file sysfs attribute formats lo->lo_backing_file while holding
lo_lock, but LOOP_CHANGE_FD replaces lo_backing_file without taking that
lock. The old file is then fput() after the swap, and that fput() may drop
the last reference. A sysfs reader that observed the old pointer can
therefore run file_path() on a file whose final put is underway.

Validation reproduced this KCSAN report (the full log is included below):
BUG: KCSAN: data-race in lo_ioctl / loop_attr_do_show_backing_file

The buggy scenario involves two paths, with each column showing the order
within that path:

sysfs backing_file show:            LOOP_CHANGE_FD:
1. Take lo_lock.                    1. Save old_file from lo_backing_file.
2. Read lo_backing_file.            2. Store the replacement file pointer.
3. Pass it to file_path().          3. Drop the loop-owned old_file ref.

Serialize loop_assign_backing_file()'s pointer store with lo_lock, the
same lock used by the sysfs show path and by __loop_clr_fd(). A sysfs
reader that took lo_lock before the swap now finishes before the old file
can be detached and fput(), and readers that enter after the swap see the
new file.

Validation reproduced this kernel report:
[   56.673265] BUG: KCSAN: data-race in lo_ioctl / loop_attr_do_show_backing_file
[   56.674430] write to 0xffff888101d21060 of 8 bytes by task 498 on cpu 1:
[   56.675365]  lo_ioctl+0x99d/0xca0
[   56.675819]  blkdev_ioctl+0x2bc/0x380
[   56.676331]  __x64_sys_ioctl+0xc7/0x110
[   56.676846]  x64_sys_call+0x1092/0x1fb0
[   56.677372]  do_syscall_64+0x100/0x570
[   56.677878]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   56.678777] read to 0xffff888101d21060 of 8 bytes by task 489 on cpu 2:
[   56.679617]  loop_attr_do_show_backing_file+0x51/0xe0
[   56.680280]  dev_attr_show+0x3b/0x90
[   56.680769]  sysfs_kf_seq_show+0x139/0x1e0
[   56.681321]  kernfs_seq_show+0x9c/0xb0
[   56.681823]  seq_read_iter+0x2b3/0x830
[   56.682336]  kernfs_fop_read_iter+0x26b/0x2d0
[   56.682917]  vfs_read+0x414/0x5c0
[   56.683393]  ksys_read+0xa3/0x130
[   56.683844]  __x64_sys_read+0x41/0x50
[   56.684344]  x64_sys_call+0x1efb/0x1fb0
[   56.684856]  do_syscall_64+0x100/0x570
[   56.685364]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   56.686252] value changed: 0xffff888104d62900 -> 0xffff888104d53680
[   56.687275] Reported by Kernel Concurrency Sanitizer on:
[   56.687963] CPU: 2 UID: 0 PID: 489 Comm: loop_changefd_r Not tainted 7.1.0-02794-g5c7804e3279c #1 PREEMPT(lazy)
[   56.689251] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   56.690673] ==================================================================
[   62.334003] ==================================================================
[   62.334986] BUG: KCSAN: data-race in lo_ioctl / loop_attr_do_show_backing_file
[   62.336145] write to 0xffff888101d21060 of 8 bytes by task 498 on cpu 3:
[   62.337000]  lo_ioctl+0x99d/0xca0
[   62.337452]  blkdev_ioctl+0x2bc/0x380
[   62.337955]  __x64_sys_ioctl+0xc7/0x110
[   62.338468]  x64_sys_call+0x1092/0x1fb0
[   62.338993]  do_syscall_64+0x100/0x570
[   62.339493]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   62.340381] read to 0xffff888101d21060 of 8 bytes by task 495 on cpu 0:
[   62.341235]  loop_attr_do_show_backing_file+0x51/0xe0
[   62.341900]  dev_attr_show+0x3b/0x90
[   62.342385]  sysfs_kf_seq_show+0x139/0x1e0
[   62.342943]  kernfs_seq_show+0x9c/0xb0
[   62.343447]  seq_read_iter+0x2b3/0x830
[   62.343955]  kernfs_fop_read_iter+0x26b/0x2d0
[   62.344537]  vfs_read+0x414/0x5c0
[   62.344988]  ksys_read+0xa3/0x130
[   62.345438]  __x64_sys_read+0x41/0x50
[   62.345937]  x64_sys_call+0x1efb/0x1fb0
[   62.346446]  do_syscall_64+0x100/0x570
[   62.346956]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   62.347832] value changed: 0xffff888104d50f00 -> 0xffff888104cf4f00
[   62.348871] Reported by Kernel Concurrency Sanitizer on:
[   62.349548] CPU: 0 UID: 0 PID: 495 Comm: loop_changefd_r Not tainted 7.1.0-02794-g5c7804e3279c #1 PREEMPT(lazy)
[   62.350823] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   62.352260] ==================================================================
[   65.676721] ==================================================================
[   65.677703] BUG: KCSAN: data-race in lo_ioctl / loop_attr_do_show_backing_file
[   65.678870] write to 0xffff888101d21060 of 8 bytes by task 498 on cpu 0:
[   65.679712]  lo_ioctl+0x99d/0xca0
[   65.680166]  blkdev_ioctl+0x2bc/0x380
[   65.680673]  __x64_sys_ioctl+0xc7/0x110
[   65.681187]  x64_sys_call+0x1092/0x1fb0
[   65.681707]  do_syscall_64+0x100/0x570
[   65.682214]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   65.683113] read to 0xffff888101d21060 of 8 bytes by task 497 on cpu 2:
[   65.683955]  loop_attr_do_show_backing_file+0x51/0xe0
[   65.684617]  dev_attr_show+0x3b/0x90
[   65.685109]  sysfs_kf_seq_show+0x139/0x1e0
[   65.685663]  kernfs_seq_show+0x9c/0xb0
[   65.686165]  seq_read_iter+0x2b3/0x830
[   65.686679]  kernfs_fop_read_iter+0x26b/0x2d0
[   65.687265]  vfs_read+0x414/0x5c0
[   65.687720]  ksys_read+0xa3/0x130
[   65.688171]  __x64_sys_read+0x41/0x50
[   65.688665]  x64_sys_call+0x1efb/0x1fb0
[   65.689177]  do_syscall_64+0x100/0x570
[   65.689688]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   65.690582] value changed: 0xffff888100c52600 -> 0xffff888104d54180
[   65.691615] Reported by Kernel Concurrency Sanitizer on:
[   65.692309] CPU: 2 UID: 0 PID: 497 Comm: loop_changefd_r Not tainted 7.1.0-02794-g5c7804e3279c #1 PREEMPT(lazy)
[   65.693596] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   65.695035] ==================================================================
[   68.697953] ==================================================================
[   68.698927] BUG: KCSAN: data-race in lo_ioctl / loop_attr_do_show_backing_file
[   68.700101] write to 0xffff888101d21060 of 8 bytes by task 498 on cpu 1:

Fixes: 05eb0f252b04 ("loop: fix deadlock when sysfs and LOOP_CLR_FD race against each other")
Assisted-by: Codex:gpt-5.5
Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
---
 drivers/block/loop.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 310de0463beb..45937741fcb6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -503,11 +503,18 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
 
 static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
 {
+	/*
+	 * Serialize the pointer update with sysfs backing_file show, which
+	 * formats the file path under lo_lock without taking a file reference.
+	 */
+	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = file;
+	spin_unlock_irq(&lo->lo_lock);
+
 	lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
 	mapping_set_gfp_mask(file->f_mapping,
 			lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
-	if (lo->lo_backing_file->f_flags & O_DIRECT)
+	if (file->f_flags & O_DIRECT)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 	lo->lo_min_dio_size = loop_query_min_dio_size(lo);
 }
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCHv2 6/6] block: validate user space vectors during extraction
From: Christoph Hellwig @ 2026-06-24  7:39 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-block, linux-fsdevel,
	dm-devel, axboe, brauner, djwong, viro, stable
In-Reply-To: <ajqxoeZ0R_RwqEKe@kbusch-mbp>

On Tue, Jun 23, 2026 at 10:17:37AM -0600, Keith Busch wrote:
> Exactly, the in-kernel users of ITER_BVEC that allocate their own
> buffers are, as far as I know, aligned already. Fabric storage targets
> like nvme allocate their own SGLs on page boundaries so the bio is
> aligned at the point it was constructed.
> 
> The ones that forward user buffers like loop and zloop are addressed in
> the previous two patches. They generally should have been fine for most
> hardware without those updates, but they're included in case a backing
> device has more restrictive constraints than 512b "sector_t" aligned.
> 
> The only other user space provided alignment that I think may trip this
> up is the io_uring registered buffer, so that's what I'm trying to call
> out here.

Sounds reasonable, but it would be really helpful to have this in
the API documentation somewhere.

Talking about documented APIs and related bits:  do you still plan
to get back to exposing our per-vector alignment requirements and
add tests to blktests/xfstests based on that?

^ permalink raw reply

* PI fixes v2
From: Christoph Hellwig @ 2026-06-24  8:00 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Caleb Sander Mateos, Martin K. Petersen, linux-block

Hi all,

this series has two unrelated PI/metadata fixes that came up
during a little testing surge.

Changes since v1:
 - take operator precedence into account so that zeroing doesn't disable
   other GFP_ flags.
 - add a commit log blurb on why Zone Append does not require remapping

Diffstat:
 block/bio-integrity-auto.c    |    2 +-
 block/bio-integrity-fs.c      |    4 ++--
 block/bio-integrity.c         |    9 ++++-----
 include/linux/bio-integrity.h |    2 +-
 4 files changed, 8 insertions(+), 9 deletions(-)

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox