* [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
@ 2025-11-20 23:54 Jaegeuk Kim
2025-11-20 23:54 ` [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission Jaegeuk Kim
` (3 more replies)
0 siblings, 4 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2025-11-20 23:54 UTC (permalink / raw)
To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim
This patch enables large folio for limited case where we can get the high-order
memory allocation. It supports the encrypted and fsverity files, which are
essential for Android environment.
How to test:
- dd if=/dev/zero of=/mnt/test/test bs=1G count=4
- f2fs_io setflags immutable /mnt/test/test
- echo 3 > /proc/sys/vm/drop_caches
: to reload inode with large folio
- f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
fs/f2fs/data.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/f2fs/f2fs.h | 16 ++++
fs/f2fs/inode.c | 6 +-
3 files changed, 257 insertions(+), 10 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 48c20386f031..8f433677c49d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio->bi_status = BLK_STS_IOERR;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -2341,6 +2376,177 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map;
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2366,9 +2572,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2459,8 +2669,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -3747,7 +3956,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3756,7 +3970,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -4162,12 +4376,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index dffe8958b580..3340db04a7c2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4916,6 +4916,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct inode *inode)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e2405b79b3cc..9162154d5211 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(inode))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
--
2.52.0.487.g5c8c507ade-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission
2025-11-20 23:54 [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Jaegeuk Kim
@ 2025-11-20 23:54 ` Jaegeuk Kim
2025-11-21 10:23 ` [f2fs-dev] " Chao Yu
2025-11-21 10:20 ` [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Chao Yu
` (2 subsequent siblings)
3 siblings, 1 reply; 20+ messages in thread
From: Jaegeuk Kim @ 2025-11-20 23:54 UTC (permalink / raw)
To: linux-kernel, linux-f2fs-devel; +Cc: Jaegeuk Kim
For example,
1327.539878: f2fs_preload_pages_start: dev = (252,16), ino = 14, i_size = 4294967296 start: 0, end: 8191
1327.539878: page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=8192 order=9 size=0 async_size=0 ra_pages=4096 mmap_miss=0 prev_pos=-1
1327.539879: page_cache_ra_order: dev=252:16 ino=e index=0 order=9 size=4096 async_size=2048 ra_pages=4096
1327.541895: f2fs_readpages: dev = (252,16), ino = 14, start = 0 nrpage = 4096
1327.541930: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 0, type = Read
1327.541931: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 0, read_ext_info(fofs: 0, len: 1048576, blk: 4221440)
1327.541931: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 0, start blkaddr = 0x406a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0
1327.541989: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 0, nr_pages = 512, dirty = 0, uptodate = 0
1327.542012: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 512, nr_pages = 512, dirty = 0, uptodate = 0
1327.542036: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 1024, nr_pages = 512, dirty = 0, uptodate = 0
1327.542080: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 1536, nr_pages = 512, dirty = 0, uptodate = 0
1327.542127: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2048, nr_pages = 512, dirty = 0, uptodate = 0
1327.542151: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2560, nr_pages = 512, dirty = 0, uptodate = 0
1327.542196: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3072, nr_pages = 512, dirty = 0, uptodate = 0
1327.542219: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3584, nr_pages = 512, dirty = 0, uptodate = 0
1327.542239: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33771520, size = 16777216
1327.542269: page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=8192 order=9 size=4096 async_size=2048 ra_pages=4096 mmap_miss=0 prev_pos=-1
1327.542289: page_cache_ra_order: dev=252:16 ino=e index=4096 order=9 size=4096 async_size=2048 ra_pages=4096
1327.544485: f2fs_readpages: dev = (252,16), ino = 14, start = 4096 nrpage = 4096
1327.544521: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 4096, type = Read
1327.544521: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 4096, read_ext_info(fofs: 0, len: 1048576, blk: 4221440)
1327.544522: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 4096, start blkaddr = 0x407a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0
1327.544550: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4096, nr_pages = 512, dirty = 0, uptodate = 0
1327.544575: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4608, nr_pages = 512, dirty = 0, uptodate = 0
1327.544601: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5120, nr_pages = 512, dirty = 0, uptodate = 0
1327.544647: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5632, nr_pages = 512, dirty = 0, uptodate = 0
1327.544692: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 6144, nr_pages = 512, dirty = 0, uptodate = 0
1327.544734: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 6656, nr_pages = 512, dirty = 0, uptodate = 0
1327.544777: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7168, nr_pages = 512, dirty = 0, uptodate = 0
1327.544805: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7680, nr_pages = 512, dirty = 0, uptodate = 0
1327.544826: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33804288, size = 16777216
1327.544852: f2fs_preload_pages_end: dev = (252,16), ino = 14, i_size = 4294967296 start: 8192, end: 8191
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
fs/f2fs/data.c | 1 +
include/trace/events/f2fs.h | 12 +++++++++++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8f433677c49d..a0433c8a4d84 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2521,6 +2521,7 @@ static int f2fs_read_data_large_folio(struct inode *inode,
index++;
offset++;
}
+ trace_f2fs_read_folio(folio, DATA);
if (rac) {
folio = readahead_folio(rac);
goto next_folio;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index e00611ead024..d406b047c50b 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1349,6 +1349,7 @@ DECLARE_EVENT_CLASS(f2fs__folio,
__field(int, type)
__field(int, dir)
__field(pgoff_t, index)
+ __field(pgoff_t, nrpages)
__field(int, dirty)
__field(int, uptodate)
),
@@ -1359,16 +1360,18 @@ DECLARE_EVENT_CLASS(f2fs__folio,
__entry->type = type;
__entry->dir = S_ISDIR(folio->mapping->host->i_mode);
__entry->index = folio->index;
+ __entry->nrpages= folio_nr_pages(folio);
__entry->dirty = folio_test_dirty(folio);
__entry->uptodate = folio_test_uptodate(folio);
),
- TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, "
+ TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, nr_pages = %lu, "
"dirty = %d, uptodate = %d",
show_dev_ino(__entry),
show_block_type(__entry->type),
show_file_type(__entry->dir),
(unsigned long)__entry->index,
+ (unsigned long)__entry->nrpages,
__entry->dirty,
__entry->uptodate)
);
@@ -1394,6 +1397,13 @@ DEFINE_EVENT(f2fs__folio, f2fs_readpage,
TP_ARGS(folio, type)
);
+DEFINE_EVENT(f2fs__folio, f2fs_read_folio,
+
+ TP_PROTO(struct folio *folio, int type),
+
+ TP_ARGS(folio, type)
+);
+
DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty,
TP_PROTO(struct folio *folio, int type),
--
2.52.0.487.g5c8c507ade-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
2025-11-20 23:54 [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Jaegeuk Kim
2025-11-20 23:54 ` [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission Jaegeuk Kim
@ 2025-11-21 10:20 ` Chao Yu
2025-11-22 1:17 ` Jaegeuk Kim
2025-12-01 19:31 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
2025-11-22 1:18 ` [PATCH 1/2 v2] " Jaegeuk Kim
2025-12-16 19:20 ` [f2fs-dev] [PATCH 1/2] " patchwork-bot+f2fs
3 siblings, 2 replies; 20+ messages in thread
From: Chao Yu @ 2025-11-21 10:20 UTC (permalink / raw)
To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel; +Cc: chao
On 11/21/2025 7:54 AM, Jaegeuk Kim via Linux-f2fs-devel wrote:
> This patch enables large folio for limited case where we can get the high-order
> memory allocation. It supports the encrypted and fsverity files, which are
> essential for Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> fs/f2fs/data.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++--
> fs/f2fs/f2fs.h | 16 ++++
> fs/f2fs/inode.c | 6 +-
> 3 files changed, 257 insertions(+), 10 deletions(-)
>
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 48c20386f031..8f433677c49d 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -31,9 +31,15 @@
>
> static struct kmem_cache *bio_post_read_ctx_cache;
> static struct kmem_cache *bio_entry_slab;
> +static struct kmem_cache *ffs_entry_slab;
> static mempool_t *bio_post_read_ctx_pool;
> static struct bio_set f2fs_bioset;
>
> +struct f2fs_folio_state {
> + spinlock_t state_lock;
> + unsigned int read_pages_pending;
> +};
> +
> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>
> int __init f2fs_init_bioset(void)
> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> {
> struct folio_iter fi;
> struct bio_post_read_ctx *ctx = bio->bi_private;
> + unsigned long flags;
>
> bio_for_each_folio_all(fi, bio) {
> struct folio *folio = fi.folio;
> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> + bool finished = true;
>
> - if (f2fs_is_compressed_page(folio)) {
> + if (!folio_test_large(folio) &&
> + f2fs_is_compressed_page(folio)) {
> if (ctx && !ctx->decompression_attempted)
> f2fs_end_read_compressed_page(folio, true, 0,
> in_task);
> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> bio->bi_status = BLK_STS_IOERR;
> }
>
> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> + if (folio_test_large(folio)) {
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + spin_lock_irqsave(&ffs->state_lock, flags);
> + ffs->read_pages_pending -= nr_pages;
> + finished = !ffs->read_pages_pending;
> + spin_unlock_irqrestore(&ffs->state_lock, flags);
> + }
> +
> + while (nr_pages--)
> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> +
> + if (finished)
> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> }
>
> if (ctx)
> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> enum page_type type)
> {
> + if (!bio)
> + return;
> +
> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>
> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> struct dnode_of_data dn;
> struct folio *folio;
> int err;
> -
> +retry:
> folio = f2fs_grab_cache_folio(mapping, index, for_write);
> if (IS_ERR(folio))
> return folio;
>
> + if (folio_test_large(folio)) {
> + pgoff_t folio_index = mapping_align_index(mapping, index);
> +
> + f2fs_folio_put(folio, true);
> + invalidate_inode_pages2_range(mapping, folio_index,
> + folio_index + folio_nr_pages(folio) - 1);
> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> + goto retry;
> + }
Do we need to move above check into f2fs_grab_cache_folio()? as we call
f2fs_grab_cache_folio() in a lot of place.
> +
> if (f2fs_lookup_read_extent_cache_block(inode, index,
> &dn.data_blkaddr)) {
> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> @@ -2341,6 +2376,177 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> }
> #endif
>
> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + if (ffs)
> + return ffs;
> +
> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> +
> + spin_lock_init(&ffs->state_lock);
> + folio_attach_private(folio, ffs);
> + return ffs;
> +}
> +
> +static void ffs_detach_free(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs;
> +
> + if (!folio_test_large(folio)) {
> + folio_detach_private(folio);
> + return;
> + }
> +
> + ffs = folio_detach_private(folio);
> + if (!ffs)
> + return;
> +
> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> + kmem_cache_free(ffs_entry_slab, ffs);
> +}
> +
> +static int f2fs_read_data_large_folio(struct inode *inode,
> + struct readahead_control *rac, struct folio *folio)
> +{
> + struct bio *bio = NULL;
> + sector_t last_block_in_bio = 0;
> + struct f2fs_map_blocks map;
> + pgoff_t index, offset;
> + unsigned max_nr_pages = rac ? readahead_count(rac) :
> + folio_nr_pages(folio);
> + unsigned nrpages;
> + struct f2fs_folio_state *ffs;
> + int ret = 0;
> +
> + if (f2fs_compressed_file(inode))
> + return -EOPNOTSUPP;
if (!IS_IMMUTABLE(inode))
return -EOPNOTSUPP;
We can configure inode after this check? Can we add some sanity check to prevent
enabling compress/immutable/quota if inode has already enabled large folio?
> +
> + memset(&map, 0, sizeof(map));
Can be replaced w/ struct f2fs_map_blocks map = {0, };
> + map.m_seg_type = NO_CHECK_TYPE;
> +
> + if (rac)
> + folio = readahead_folio(rac);
> +next_folio:
> + if (!folio)
> + goto out;
> +
> + index = folio->index;
> + offset = 0;
> + ffs = NULL;
> + nrpages = folio_nr_pages(folio);
> +
> + for (; nrpages; nrpages--) {
> + sector_t block_nr;
> + /*
> + * Map blocks using the previous result first.
> + */
> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> + index > map.m_lblk &&
> + index < (map.m_lblk + map.m_len))
> + goto got_it;
> +
> + /*
> + * Then do more f2fs_map_blocks() calls until we are
> + * done with this page.
> + */
> + memset(&map, 0, sizeof(map));
> + map.m_seg_type = NO_CHECK_TYPE;
> + map.m_lblk = index;
> + map.m_len = max_nr_pages;
> +
> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> + if (ret)
> + goto err_out;
> +got_it:
> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> + block_nr = map.m_pblk + index - map.m_lblk;
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> + DATA_GENERIC_ENHANCE_READ)) {
> + ret = -EFSCORRUPTED;
> + goto err_out;
> + }
> + } else {
> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> + if (f2fs_need_verity(inode, index) &&
> + !fsverity_verify_page(folio_file_page(folio,
> + index))) {
> + ret = -EIO;
> + goto err_out;
> + }
> + continue;
> + }
> +
> + /*
> + * This page will go to BIO. Do we need to send this
> + * BIO off first?
> + */
> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> + last_block_in_bio, block_nr) ||
> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> +submit_and_realloc:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + bio = NULL;
> + }
> + if (bio == NULL)
> + bio = f2fs_grab_read_bio(inode, block_nr,
> + max_nr_pages,
> + f2fs_ra_op_flags(rac),
> + index, false);
> +
> + /*
> + * If the page is under writeback, we need to wait for
> + * its completion to see the correct decrypted data.
> + */
> + f2fs_wait_on_block_writeback(inode, block_nr);
> +
> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> + offset << PAGE_SHIFT))
> + goto submit_and_realloc;
> +
> + if (folio_test_large(folio)) {
> + ffs = ffs_find_or_alloc(folio);
> +
> + /* set the bitmap to wait */
> + spin_lock_irq(&ffs->state_lock);
> + ffs->read_pages_pending++;
> + spin_unlock_irq(&ffs->state_lock);
> + }
> +
> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> + F2FS_BLKSIZE);
> + last_block_in_bio = block_nr;
> + index++;
> + offset++;
> + }
> + if (rac) {
> + folio = readahead_folio(rac);
> + goto next_folio;
> + }
> +err_out:
> + /* Nothing was submitted. */
> + if (!bio) {
> + if (!ret)
> + folio_mark_uptodate(folio);
> + folio_unlock(folio);
> + return ret;
> + }
> +
> + if (ret) {
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> +
> + /* Wait bios and clear uptodate. */
> + folio_lock(folio);
> + folio_clear_uptodate(folio);
> + folio_unlock(folio);
> + }
> +out:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + return ret;
> +}
> +
> /*
> * This function was originally taken from fs/mpage.c, and customized for f2fs.
> * Major change was from block_size == page_size in f2fs by default.
> @@ -2366,9 +2572,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> pgoff_t index;
> #endif
> unsigned nr_pages = rac ? readahead_count(rac) : 1;
> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> unsigned max_nr_pages = nr_pages;
> int ret = 0;
>
> + if (mapping_large_folio_support(mapping))
> + return f2fs_read_data_large_folio(inode, rac, folio);
> +
> #ifdef CONFIG_F2FS_FS_COMPRESSION
> if (f2fs_compressed_file(inode)) {
> index = rac ? readahead_index(rac) : folio->index;
> @@ -2459,8 +2669,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> }
> #endif
> }
> - if (bio)
> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> return ret;
> }
>
> @@ -3747,7 +3956,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> f2fs_remove_dirty_inode(inode);
> }
> }
> - folio_detach_private(folio);
> +
> + if (offset || length != folio_size(folio))
> + return;
> +
> + folio_cancel_dirty(folio);
> + ffs_detach_free(folio);
> }
>
> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> @@ -3756,7 +3970,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> if (folio_test_dirty(folio))
> return false;
>
> - folio_detach_private(folio);
> + ffs_detach_free(folio);
> return true;
> }
>
> @@ -4162,12 +4376,25 @@ int __init f2fs_init_bio_entry_cache(void)
> {
> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> sizeof(struct bio_entry));
> - return bio_entry_slab ? 0 : -ENOMEM;
> +
> + if (!bio_entry_slab)
> + return -ENOMEM;
> +
> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> + sizeof(struct f2fs_folio_state));
> +
> + if (!ffs_entry_slab) {
> + kmem_cache_destroy(bio_entry_slab);
> + return -ENOMEM;
> + }
> +
> + return 0;
> }
>
> void f2fs_destroy_bio_entry_cache(void)
> {
> kmem_cache_destroy(bio_entry_slab);
> + kmem_cache_destroy(ffs_entry_slab);
> }
>
> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index dffe8958b580..3340db04a7c2 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -4916,6 +4916,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> return false;
> }
>
> +static inline bool f2fs_quota_file(struct inode *inode)
> +{
> +#ifdef CONFIG_QUOTA
> + int i;
> +
> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> + return false;
> +
> + for (i = 0; i < MAXQUOTAS; i++) {
> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> + return true;
> + }
> +#endif
> + return false;
> +}
> +
> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> {
> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index e2405b79b3cc..9162154d5211 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> if (ret)
> goto bad_inode;
> make_now:
> + f2fs_set_inode_flags(inode);
> +
> if (ino == F2FS_NODE_INO(sbi)) {
> inode->i_mapping->a_ops = &f2fs_node_aops;
> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> inode->i_op = &f2fs_file_inode_operations;
> inode->i_fop = &f2fs_file_operations;
> inode->i_mapping->a_ops = &f2fs_dblock_aops;
> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> + !f2fs_quota_file(inode))
> + mapping_set_folio_min_order(inode->i_mapping, 0);
> } else if (S_ISDIR(inode->i_mode)) {
> inode->i_op = &f2fs_dir_inode_operations;
> inode->i_fop = &f2fs_dir_operations;
> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> ret = -EIO;
> goto bad_inode;
> }
> - f2fs_set_inode_flags(inode);
>
> unlock_new_inode(inode);
> trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission
2025-11-20 23:54 ` [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission Jaegeuk Kim
@ 2025-11-21 10:23 ` Chao Yu
0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2025-11-21 10:23 UTC (permalink / raw)
To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel; +Cc: chao
On 11/21/2025 7:54 AM, Jaegeuk Kim via Linux-f2fs-devel wrote:
> For example,
>
> 1327.539878: f2fs_preload_pages_start: dev = (252,16), ino = 14, i_size = 4294967296 start: 0, end: 8191
> 1327.539878: page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=8192 order=9 size=0 async_size=0 ra_pages=4096 mmap_miss=0 prev_pos=-1
> 1327.539879: page_cache_ra_order: dev=252:16 ino=e index=0 order=9 size=4096 async_size=2048 ra_pages=4096
> 1327.541895: f2fs_readpages: dev = (252,16), ino = 14, start = 0 nrpage = 4096
> 1327.541930: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 0, type = Read
> 1327.541931: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 0, read_ext_info(fofs: 0, len: 1048576, blk: 4221440)
> 1327.541931: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 0, start blkaddr = 0x406a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0
> 1327.541989: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 0, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542012: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 512, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542036: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 1024, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542080: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 1536, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542127: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2048, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542151: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 2560, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542196: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3072, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542219: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 3584, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.542239: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33771520, size = 16777216
> 1327.542269: page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=8192 order=9 size=4096 async_size=2048 ra_pages=4096 mmap_miss=0 prev_pos=-1
> 1327.542289: page_cache_ra_order: dev=252:16 ino=e index=4096 order=9 size=4096 async_size=2048 ra_pages=4096
> 1327.544485: f2fs_readpages: dev = (252,16), ino = 14, start = 4096 nrpage = 4096
> 1327.544521: f2fs_lookup_extent_tree_start: dev = (252,16), ino = 14, pgofs = 4096, type = Read
> 1327.544521: f2fs_lookup_read_extent_tree_end: dev = (252,16), ino = 14, pgofs = 4096, read_ext_info(fofs: 0, len: 1048576, blk: 4221440)
> 1327.544522: f2fs_map_blocks: dev = (252,16), ino = 14, file offset = 4096, start blkaddr = 0x407a00, len = 0x1000, flags = 2, seg_type = 8, may_create = 0, multidevice = 0, flag = 0, err = 0
> 1327.544550: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4096, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544575: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 4608, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544601: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5120, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544647: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 5632, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544692: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 6144, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544734: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 6656, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544777: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7168, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544805: f2fs_read_folio: dev = (252,16), ino = 14, DATA, FILE, index = 7680, nr_pages = 512, dirty = 0, uptodate = 0
> 1327.544826: f2fs_submit_read_bio: dev = (252,16)/(252,16), rw = READ(R), DATA, sector = 33804288, size = 16777216
> 1327.544852: f2fs_preload_pages_end: dev = (252,16), ino = 14, i_size = 4294967296 start: 8192, end: 8191
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Thanks,
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
2025-11-21 10:20 ` [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Chao Yu
@ 2025-11-22 1:17 ` Jaegeuk Kim
2025-11-25 1:38 ` Chao Yu
2025-12-01 19:31 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
1 sibling, 1 reply; 20+ messages in thread
From: Jaegeuk Kim @ 2025-11-22 1:17 UTC (permalink / raw)
To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel
On 11/21, Chao Yu wrote:
> On 11/21/2025 7:54 AM, Jaegeuk Kim via Linux-f2fs-devel wrote:
> > This patch enables large folio for limited case where we can get the high-order
> > memory allocation. It supports the encrypted and fsverity files, which are
> > essential for Android environment.
> >
> > How to test:
> > - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> > - f2fs_io setflags immutable /mnt/test/test
> > - echo 3 > /proc/sys/vm/drop_caches
> > : to reload inode with large folio
> > - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
> >
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> > fs/f2fs/data.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++--
> > fs/f2fs/f2fs.h | 16 ++++
> > fs/f2fs/inode.c | 6 +-
> > 3 files changed, 257 insertions(+), 10 deletions(-)
> >
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 48c20386f031..8f433677c49d 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -31,9 +31,15 @@
> > static struct kmem_cache *bio_post_read_ctx_cache;
> > static struct kmem_cache *bio_entry_slab;
> > +static struct kmem_cache *ffs_entry_slab;
> > static mempool_t *bio_post_read_ctx_pool;
> > static struct bio_set f2fs_bioset;
> > +struct f2fs_folio_state {
> > + spinlock_t state_lock;
> > + unsigned int read_pages_pending;
> > +};
> > +
> > #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
> > int __init f2fs_init_bioset(void)
> > @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> > {
> > struct folio_iter fi;
> > struct bio_post_read_ctx *ctx = bio->bi_private;
> > + unsigned long flags;
> > bio_for_each_folio_all(fi, bio) {
> > struct folio *folio = fi.folio;
> > + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> > + bool finished = true;
> > - if (f2fs_is_compressed_page(folio)) {
> > + if (!folio_test_large(folio) &&
> > + f2fs_is_compressed_page(folio)) {
> > if (ctx && !ctx->decompression_attempted)
> > f2fs_end_read_compressed_page(folio, true, 0,
> > in_task);
> > @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> > bio->bi_status = BLK_STS_IOERR;
> > }
> > - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> > - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> > + if (folio_test_large(folio)) {
> > + struct f2fs_folio_state *ffs = folio->private;
> > +
> > + spin_lock_irqsave(&ffs->state_lock, flags);
> > + ffs->read_pages_pending -= nr_pages;
> > + finished = !ffs->read_pages_pending;
> > + spin_unlock_irqrestore(&ffs->state_lock, flags);
> > + }
> > +
> > + while (nr_pages--)
> > + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> > +
> > + if (finished)
> > + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> > }
> > if (ctx)
> > @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> > void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> > enum page_type type)
> > {
> > + if (!bio)
> > + return;
> > +
> > WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> > trace_f2fs_submit_read_bio(sbi->sb, type, bio);
> > @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> > struct dnode_of_data dn;
> > struct folio *folio;
> > int err;
> > -
> > +retry:
> > folio = f2fs_grab_cache_folio(mapping, index, for_write);
> > if (IS_ERR(folio))
> > return folio;
> > + if (folio_test_large(folio)) {
> > + pgoff_t folio_index = mapping_align_index(mapping, index);
> > +
> > + f2fs_folio_put(folio, true);
> > + invalidate_inode_pages2_range(mapping, folio_index,
> > + folio_index + folio_nr_pages(folio) - 1);
> > + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> > + goto retry;
> > + }
>
> Do we need to move above check into f2fs_grab_cache_folio()? as we call
> f2fs_grab_cache_folio() in a lot of place.
We're okay with high-order allocation in other path, but I think this is
the only problem since it goes to GC writes.
>
> > +
> > if (f2fs_lookup_read_extent_cache_block(inode, index,
> > &dn.data_blkaddr)) {
> > if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> > @@ -2341,6 +2376,177 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> > }
> > #endif
> > +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> > +{
> > + struct f2fs_folio_state *ffs = folio->private;
> > +
> > + if (ffs)
> > + return ffs;
> > +
> > + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> > +
> > + spin_lock_init(&ffs->state_lock);
> > + folio_attach_private(folio, ffs);
> > + return ffs;
> > +}
> > +
> > +static void ffs_detach_free(struct folio *folio)
> > +{
> > + struct f2fs_folio_state *ffs;
> > +
> > + if (!folio_test_large(folio)) {
> > + folio_detach_private(folio);
> > + return;
> > + }
> > +
> > + ffs = folio_detach_private(folio);
> > + if (!ffs)
> > + return;
> > +
> > + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> > + kmem_cache_free(ffs_entry_slab, ffs);
> > +}
> > +
> > +static int f2fs_read_data_large_folio(struct inode *inode,
> > + struct readahead_control *rac, struct folio *folio)
> > +{
> > + struct bio *bio = NULL;
> > + sector_t last_block_in_bio = 0;
> > + struct f2fs_map_blocks map;
> > + pgoff_t index, offset;
> > + unsigned max_nr_pages = rac ? readahead_count(rac) :
> > + folio_nr_pages(folio);
> > + unsigned nrpages;
> > + struct f2fs_folio_state *ffs;
> > + int ret = 0;
> > +
> > + if (f2fs_compressed_file(inode))
> > + return -EOPNOTSUPP;
>
> if (!IS_IMMUTABLE(inode))
> return -EOPNOTSUPP;
>
> We can configure inode after this check? Can we add some sanity check to prevent
> enabling compress/immutable/quota if inode has already enabled large folio?
I think immutable will prevent most of the changes?
>
> > +
> > + memset(&map, 0, sizeof(map));
>
> Can be replaced w/ struct f2fs_map_blocks map = {0, };
>
> > + map.m_seg_type = NO_CHECK_TYPE;
> > +
> > + if (rac)
> > + folio = readahead_folio(rac);
> > +next_folio:
> > + if (!folio)
> > + goto out;
> > +
> > + index = folio->index;
> > + offset = 0;
> > + ffs = NULL;
> > + nrpages = folio_nr_pages(folio);
> > +
> > + for (; nrpages; nrpages--) {
> > + sector_t block_nr;
> > + /*
> > + * Map blocks using the previous result first.
> > + */
> > + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> > + index > map.m_lblk &&
> > + index < (map.m_lblk + map.m_len))
> > + goto got_it;
> > +
> > + /*
> > + * Then do more f2fs_map_blocks() calls until we are
> > + * done with this page.
> > + */
> > + memset(&map, 0, sizeof(map));
> > + map.m_seg_type = NO_CHECK_TYPE;
> > + map.m_lblk = index;
> > + map.m_len = max_nr_pages;
> > +
> > + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> > + if (ret)
> > + goto err_out;
> > +got_it:
> > + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> > + block_nr = map.m_pblk + index - map.m_lblk;
> > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> > + DATA_GENERIC_ENHANCE_READ)) {
> > + ret = -EFSCORRUPTED;
> > + goto err_out;
> > + }
> > + } else {
> > + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> > + if (f2fs_need_verity(inode, index) &&
> > + !fsverity_verify_page(folio_file_page(folio,
> > + index))) {
> > + ret = -EIO;
> > + goto err_out;
> > + }
> > + continue;
> > + }
> > +
> > + /*
> > + * This page will go to BIO. Do we need to send this
> > + * BIO off first?
> > + */
> > + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> > + last_block_in_bio, block_nr) ||
> > + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> > +submit_and_realloc:
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + bio = NULL;
> > + }
> > + if (bio == NULL)
> > + bio = f2fs_grab_read_bio(inode, block_nr,
> > + max_nr_pages,
> > + f2fs_ra_op_flags(rac),
> > + index, false);
> > +
> > + /*
> > + * If the page is under writeback, we need to wait for
> > + * its completion to see the correct decrypted data.
> > + */
> > + f2fs_wait_on_block_writeback(inode, block_nr);
> > +
> > + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> > + offset << PAGE_SHIFT))
> > + goto submit_and_realloc;
> > +
> > + if (folio_test_large(folio)) {
> > + ffs = ffs_find_or_alloc(folio);
> > +
> > + /* set the bitmap to wait */
> > + spin_lock_irq(&ffs->state_lock);
> > + ffs->read_pages_pending++;
> > + spin_unlock_irq(&ffs->state_lock);
> > + }
> > +
> > + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> > + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> > + F2FS_BLKSIZE);
> > + last_block_in_bio = block_nr;
> > + index++;
> > + offset++;
> > + }
> > + if (rac) {
> > + folio = readahead_folio(rac);
> > + goto next_folio;
> > + }
> > +err_out:
> > + /* Nothing was submitted. */
> > + if (!bio) {
> > + if (!ret)
> > + folio_mark_uptodate(folio);
> > + folio_unlock(folio);
> > + return ret;
> > + }
> > +
> > + if (ret) {
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > +
> > + /* Wait bios and clear uptodate. */
> > + folio_lock(folio);
> > + folio_clear_uptodate(folio);
> > + folio_unlock(folio);
> > + }
> > +out:
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + return ret;
> > +}
> > +
> > /*
> > * This function was originally taken from fs/mpage.c, and customized for f2fs.
> > * Major change was from block_size == page_size in f2fs by default.
> > @@ -2366,9 +2572,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> > pgoff_t index;
> > #endif
> > unsigned nr_pages = rac ? readahead_count(rac) : 1;
> > + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> > unsigned max_nr_pages = nr_pages;
> > int ret = 0;
> > + if (mapping_large_folio_support(mapping))
> > + return f2fs_read_data_large_folio(inode, rac, folio);
> > +
> > #ifdef CONFIG_F2FS_FS_COMPRESSION
> > if (f2fs_compressed_file(inode)) {
> > index = rac ? readahead_index(rac) : folio->index;
> > @@ -2459,8 +2669,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> > }
> > #endif
> > }
> > - if (bio)
> > - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > return ret;
> > }
> > @@ -3747,7 +3956,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> > f2fs_remove_dirty_inode(inode);
> > }
> > }
> > - folio_detach_private(folio);
> > +
> > + if (offset || length != folio_size(folio))
> > + return;
> > +
> > + folio_cancel_dirty(folio);
> > + ffs_detach_free(folio);
> > }
> > bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> > @@ -3756,7 +3970,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> > if (folio_test_dirty(folio))
> > return false;
> > - folio_detach_private(folio);
> > + ffs_detach_free(folio);
> > return true;
> > }
> > @@ -4162,12 +4376,25 @@ int __init f2fs_init_bio_entry_cache(void)
> > {
> > bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> > sizeof(struct bio_entry));
> > - return bio_entry_slab ? 0 : -ENOMEM;
> > +
> > + if (!bio_entry_slab)
> > + return -ENOMEM;
> > +
> > + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> > + sizeof(struct f2fs_folio_state));
> > +
> > + if (!ffs_entry_slab) {
> > + kmem_cache_destroy(bio_entry_slab);
> > + return -ENOMEM;
> > + }
> > +
> > + return 0;
> > }
> > void f2fs_destroy_bio_entry_cache(void)
> > {
> > kmem_cache_destroy(bio_entry_slab);
> > + kmem_cache_destroy(ffs_entry_slab);
> > }
> > static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index dffe8958b580..3340db04a7c2 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -4916,6 +4916,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> > return false;
> > }
> > +static inline bool f2fs_quota_file(struct inode *inode)
> > +{
> > +#ifdef CONFIG_QUOTA
> > + int i;
> > +
> > + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> > + return false;
> > +
> > + for (i = 0; i < MAXQUOTAS; i++) {
> > + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> > + return true;
> > + }
> > +#endif
> > + return false;
> > +}
> > +
> > static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> > {
> > return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> > diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> > index e2405b79b3cc..9162154d5211 100644
> > --- a/fs/f2fs/inode.c
> > +++ b/fs/f2fs/inode.c
> > @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > if (ret)
> > goto bad_inode;
> > make_now:
> > + f2fs_set_inode_flags(inode);
> > +
> > if (ino == F2FS_NODE_INO(sbi)) {
> > inode->i_mapping->a_ops = &f2fs_node_aops;
> > mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> > @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > inode->i_op = &f2fs_file_inode_operations;
> > inode->i_fop = &f2fs_file_operations;
> > inode->i_mapping->a_ops = &f2fs_dblock_aops;
> > + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> > + !f2fs_quota_file(inode))
> > + mapping_set_folio_min_order(inode->i_mapping, 0);
> > } else if (S_ISDIR(inode->i_mode)) {
> > inode->i_op = &f2fs_dir_inode_operations;
> > inode->i_fop = &f2fs_dir_operations;
> > @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > ret = -EIO;
> > goto bad_inode;
> > }
> > - f2fs_set_inode_flags(inode);
> > unlock_new_inode(inode);
> > trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2 v2] f2fs: support large folio for immutable non-compressed case
2025-11-20 23:54 [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Jaegeuk Kim
2025-11-20 23:54 ` [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission Jaegeuk Kim
2025-11-21 10:20 ` [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Chao Yu
@ 2025-11-22 1:18 ` Jaegeuk Kim
2025-12-16 19:20 ` [f2fs-dev] [PATCH 1/2] " patchwork-bot+f2fs
3 siblings, 0 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2025-11-22 1:18 UTC (permalink / raw)
To: linux-kernel, linux-f2fs-devel
This patch enables large folio for limited case where we can get the high-order
memory allocation. It supports the encrypted and fsverity files, which are
essential for Android environment.
How to test:
- dd if=/dev/zero of=/mnt/test/test bs=1G count=4
- f2fs_io setflags immutable /mnt/test/test
- echo 3 > /proc/sys/vm/drop_caches
: to reload inode with large folio
- f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
Change log from v1:
- remove memset
- check immutable support
fs/f2fs/data.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/f2fs/f2fs.h | 16 ++++
fs/f2fs/inode.c | 6 +-
3 files changed, 259 insertions(+), 10 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 48c20386f031..acc4ef511bfb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio->bi_status = BLK_STS_IOERR;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5f104518c414..71adfacaca45 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4921,6 +4921,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct inode *inode)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e2405b79b3cc..9162154d5211 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(inode))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
--
2.52.0.487.g5c8c507ade-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
2025-11-22 1:17 ` Jaegeuk Kim
@ 2025-11-25 1:38 ` Chao Yu
0 siblings, 0 replies; 20+ messages in thread
From: Chao Yu @ 2025-11-25 1:38 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: chao, linux-kernel, linux-f2fs-devel
On 11/22/2025 9:17 AM, Jaegeuk Kim wrote:
> On 11/21, Chao Yu wrote:
>> On 11/21/2025 7:54 AM, Jaegeuk Kim via Linux-f2fs-devel wrote:
>>> This patch enables large folio for limited case where we can get the high-order
>>> memory allocation. It supports the encrypted and fsverity files, which are
>>> essential for Android environment.
>>>
>>> How to test:
>>> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
>>> - f2fs_io setflags immutable /mnt/test/test
>>> - echo 3 > /proc/sys/vm/drop_caches
>>> : to reload inode with large folio
>>> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>>>
>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>> ---
>>> fs/f2fs/data.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++--
>>> fs/f2fs/f2fs.h | 16 ++++
>>> fs/f2fs/inode.c | 6 +-
>>> 3 files changed, 257 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>>> index 48c20386f031..8f433677c49d 100644
>>> --- a/fs/f2fs/data.c
>>> +++ b/fs/f2fs/data.c
>>> @@ -31,9 +31,15 @@
>>> static struct kmem_cache *bio_post_read_ctx_cache;
>>> static struct kmem_cache *bio_entry_slab;
>>> +static struct kmem_cache *ffs_entry_slab;
>>> static mempool_t *bio_post_read_ctx_pool;
>>> static struct bio_set f2fs_bioset;
>>> +struct f2fs_folio_state {
>>> + spinlock_t state_lock;
>>> + unsigned int read_pages_pending;
>>> +};
>>> +
>>> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>>> int __init f2fs_init_bioset(void)
>>> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
>>> {
>>> struct folio_iter fi;
>>> struct bio_post_read_ctx *ctx = bio->bi_private;
>>> + unsigned long flags;
>>> bio_for_each_folio_all(fi, bio) {
>>> struct folio *folio = fi.folio;
>>> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
>>> + bool finished = true;
>>> - if (f2fs_is_compressed_page(folio)) {
>>> + if (!folio_test_large(folio) &&
>>> + f2fs_is_compressed_page(folio)) {
>>> if (ctx && !ctx->decompression_attempted)
>>> f2fs_end_read_compressed_page(folio, true, 0,
>>> in_task);
>>> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
>>> bio->bi_status = BLK_STS_IOERR;
>>> }
>>> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
>>> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
>>> + if (folio_test_large(folio)) {
>>> + struct f2fs_folio_state *ffs = folio->private;
>>> +
>>> + spin_lock_irqsave(&ffs->state_lock, flags);
>>> + ffs->read_pages_pending -= nr_pages;
>>> + finished = !ffs->read_pages_pending;
>>> + spin_unlock_irqrestore(&ffs->state_lock, flags);
>>> + }
>>> +
>>> + while (nr_pages--)
>>> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
>>> +
>>> + if (finished)
>>> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
>>> }
>>> if (ctx)
>>> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
>>> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
>>> enum page_type type)
>>> {
>>> + if (!bio)
>>> + return;
>>> +
>>> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
>>> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>>> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
>>> struct dnode_of_data dn;
>>> struct folio *folio;
>>> int err;
>>> -
>>> +retry:
>>> folio = f2fs_grab_cache_folio(mapping, index, for_write);
>>> if (IS_ERR(folio))
>>> return folio;
>>> + if (folio_test_large(folio)) {
>>> + pgoff_t folio_index = mapping_align_index(mapping, index);
>>> +
>>> + f2fs_folio_put(folio, true);
>>> + invalidate_inode_pages2_range(mapping, folio_index,
>>> + folio_index + folio_nr_pages(folio) - 1);
>>> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
>>> + goto retry;
>>> + }
>>
>> Do we need to move above check into f2fs_grab_cache_folio()? as we call
>> f2fs_grab_cache_folio() in a lot of place.
>
> We're okay with high-order allocation in other path, but I think this is
> the only problem since it goes to GC writes.
Oh, right.
>
>>
>>> +
>>> if (f2fs_lookup_read_extent_cache_block(inode, index,
>>> &dn.data_blkaddr)) {
>>> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
>>> @@ -2341,6 +2376,177 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>>> }
>>> #endif
>>> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
>>> +{
>>> + struct f2fs_folio_state *ffs = folio->private;
>>> +
>>> + if (ffs)
>>> + return ffs;
>>> +
>>> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
>>> +
>>> + spin_lock_init(&ffs->state_lock);
>>> + folio_attach_private(folio, ffs);
>>> + return ffs;
>>> +}
>>> +
>>> +static void ffs_detach_free(struct folio *folio)
>>> +{
>>> + struct f2fs_folio_state *ffs;
>>> +
>>> + if (!folio_test_large(folio)) {
>>> + folio_detach_private(folio);
>>> + return;
>>> + }
>>> +
>>> + ffs = folio_detach_private(folio);
>>> + if (!ffs)
>>> + return;
>>> +
>>> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
>>> + kmem_cache_free(ffs_entry_slab, ffs);
>>> +}
>>> +
>>> +static int f2fs_read_data_large_folio(struct inode *inode,
>>> + struct readahead_control *rac, struct folio *folio)
>>> +{
>>> + struct bio *bio = NULL;
>>> + sector_t last_block_in_bio = 0;
>>> + struct f2fs_map_blocks map;
>>> + pgoff_t index, offset;
>>> + unsigned max_nr_pages = rac ? readahead_count(rac) :
>>> + folio_nr_pages(folio);
>>> + unsigned nrpages;
>>> + struct f2fs_folio_state *ffs;
>>> + int ret = 0;
>>> +
>>> + if (f2fs_compressed_file(inode))
>>> + return -EOPNOTSUPP;
>>
>> if (!IS_IMMUTABLE(inode))
>> return -EOPNOTSUPP;
>>
>> We can configure inode after this check? Can we add some sanity check to prevent
>> enabling compress/immutable/quota if inode has already enabled large folio?
>
> I think immutable will prevent most of the changes?
Someone can drop immutable flag after above check condition in parallel?
Do we need to cover read() w/ inode_lock_shared() to prevent f2fs_fileattr_set
w/ non-immutable flag concurrently?
Thanks,
>
>>
>>> +
>>> + memset(&map, 0, sizeof(map));
>>
>> Can be replaced w/ struct f2fs_map_blocks map = {0, };
>>
>>> + map.m_seg_type = NO_CHECK_TYPE;
>>> +
>>> + if (rac)
>>> + folio = readahead_folio(rac);
>>> +next_folio:
>>> + if (!folio)
>>> + goto out;
>>> +
>>> + index = folio->index;
>>> + offset = 0;
>>> + ffs = NULL;
>>> + nrpages = folio_nr_pages(folio);
>>> +
>>> + for (; nrpages; nrpages--) {
>>> + sector_t block_nr;
>>> + /*
>>> + * Map blocks using the previous result first.
>>> + */
>>> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
>>> + index > map.m_lblk &&
>>> + index < (map.m_lblk + map.m_len))
>>> + goto got_it;
>>> +
>>> + /*
>>> + * Then do more f2fs_map_blocks() calls until we are
>>> + * done with this page.
>>> + */
>>> + memset(&map, 0, sizeof(map));
>>> + map.m_seg_type = NO_CHECK_TYPE;
>>> + map.m_lblk = index;
>>> + map.m_len = max_nr_pages;
>>> +
>>> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
>>> + if (ret)
>>> + goto err_out;
>>> +got_it:
>>> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
>>> + block_nr = map.m_pblk + index - map.m_lblk;
>>> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
>>> + DATA_GENERIC_ENHANCE_READ)) {
>>> + ret = -EFSCORRUPTED;
>>> + goto err_out;
>>> + }
>>> + } else {
>>> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
>>> + if (f2fs_need_verity(inode, index) &&
>>> + !fsverity_verify_page(folio_file_page(folio,
>>> + index))) {
>>> + ret = -EIO;
>>> + goto err_out;
>>> + }
>>> + continue;
>>> + }
>>> +
>>> + /*
>>> + * This page will go to BIO. Do we need to send this
>>> + * BIO off first?
>>> + */
>>> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
>>> + last_block_in_bio, block_nr) ||
>>> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
>>> +submit_and_realloc:
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + bio = NULL;
>>> + }
>>> + if (bio == NULL)
>>> + bio = f2fs_grab_read_bio(inode, block_nr,
>>> + max_nr_pages,
>>> + f2fs_ra_op_flags(rac),
>>> + index, false);
>>> +
>>> + /*
>>> + * If the page is under writeback, we need to wait for
>>> + * its completion to see the correct decrypted data.
>>> + */
>>> + f2fs_wait_on_block_writeback(inode, block_nr);
>>> +
>>> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
>>> + offset << PAGE_SHIFT))
>>> + goto submit_and_realloc;
>>> +
>>> + if (folio_test_large(folio)) {
>>> + ffs = ffs_find_or_alloc(folio);
>>> +
>>> + /* set the bitmap to wait */
>>> + spin_lock_irq(&ffs->state_lock);
>>> + ffs->read_pages_pending++;
>>> + spin_unlock_irq(&ffs->state_lock);
>>> + }
>>> +
>>> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
>>> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
>>> + F2FS_BLKSIZE);
>>> + last_block_in_bio = block_nr;
>>> + index++;
>>> + offset++;
>>> + }
>>> + if (rac) {
>>> + folio = readahead_folio(rac);
>>> + goto next_folio;
>>> + }
>>> +err_out:
>>> + /* Nothing was submitted. */
>>> + if (!bio) {
>>> + if (!ret)
>>> + folio_mark_uptodate(folio);
>>> + folio_unlock(folio);
>>> + return ret;
>>> + }
>>> +
>>> + if (ret) {
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> +
>>> + /* Wait bios and clear uptodate. */
>>> + folio_lock(folio);
>>> + folio_clear_uptodate(folio);
>>> + folio_unlock(folio);
>>> + }
>>> +out:
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + return ret;
>>> +}
>>> +
>>> /*
>>> * This function was originally taken from fs/mpage.c, and customized for f2fs.
>>> * Major change was from block_size == page_size in f2fs by default.
>>> @@ -2366,9 +2572,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
>>> pgoff_t index;
>>> #endif
>>> unsigned nr_pages = rac ? readahead_count(rac) : 1;
>>> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
>>> unsigned max_nr_pages = nr_pages;
>>> int ret = 0;
>>> + if (mapping_large_folio_support(mapping))
>>> + return f2fs_read_data_large_folio(inode, rac, folio);
>>> +
>>> #ifdef CONFIG_F2FS_FS_COMPRESSION
>>> if (f2fs_compressed_file(inode)) {
>>> index = rac ? readahead_index(rac) : folio->index;
>>> @@ -2459,8 +2669,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
>>> }
>>> #endif
>>> }
>>> - if (bio)
>>> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> return ret;
>>> }
>>> @@ -3747,7 +3956,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
>>> f2fs_remove_dirty_inode(inode);
>>> }
>>> }
>>> - folio_detach_private(folio);
>>> +
>>> + if (offset || length != folio_size(folio))
>>> + return;
>>> +
>>> + folio_cancel_dirty(folio);
>>> + ffs_detach_free(folio);
>>> }
>>> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
>>> @@ -3756,7 +3970,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
>>> if (folio_test_dirty(folio))
>>> return false;
>>> - folio_detach_private(folio);
>>> + ffs_detach_free(folio);
>>> return true;
>>> }
>>> @@ -4162,12 +4376,25 @@ int __init f2fs_init_bio_entry_cache(void)
>>> {
>>> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
>>> sizeof(struct bio_entry));
>>> - return bio_entry_slab ? 0 : -ENOMEM;
>>> +
>>> + if (!bio_entry_slab)
>>> + return -ENOMEM;
>>> +
>>> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
>>> + sizeof(struct f2fs_folio_state));
>>> +
>>> + if (!ffs_entry_slab) {
>>> + kmem_cache_destroy(bio_entry_slab);
>>> + return -ENOMEM;
>>> + }
>>> +
>>> + return 0;
>>> }
>>> void f2fs_destroy_bio_entry_cache(void)
>>> {
>>> kmem_cache_destroy(bio_entry_slab);
>>> + kmem_cache_destroy(ffs_entry_slab);
>>> }
>>> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index dffe8958b580..3340db04a7c2 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -4916,6 +4916,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
>>> return false;
>>> }
>>> +static inline bool f2fs_quota_file(struct inode *inode)
>>> +{
>>> +#ifdef CONFIG_QUOTA
>>> + int i;
>>> +
>>> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
>>> + return false;
>>> +
>>> + for (i = 0; i < MAXQUOTAS; i++) {
>>> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
>>> + return true;
>>> + }
>>> +#endif
>>> + return false;
>>> +}
>>> +
>>> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
>>> {
>>> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
>>> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
>>> index e2405b79b3cc..9162154d5211 100644
>>> --- a/fs/f2fs/inode.c
>>> +++ b/fs/f2fs/inode.c
>>> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> if (ret)
>>> goto bad_inode;
>>> make_now:
>>> + f2fs_set_inode_flags(inode);
>>> +
>>> if (ino == F2FS_NODE_INO(sbi)) {
>>> inode->i_mapping->a_ops = &f2fs_node_aops;
>>> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
>>> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> inode->i_op = &f2fs_file_inode_operations;
>>> inode->i_fop = &f2fs_file_operations;
>>> inode->i_mapping->a_ops = &f2fs_dblock_aops;
>>> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
>>> + !f2fs_quota_file(inode))
>>> + mapping_set_folio_min_order(inode->i_mapping, 0);
>>> } else if (S_ISDIR(inode->i_mode)) {
>>> inode->i_op = &f2fs_dir_inode_operations;
>>> inode->i_fop = &f2fs_dir_operations;
>>> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> ret = -EIO;
>>> goto bad_inode;
>>> }
>>> - f2fs_set_inode_flags(inode);
>>> unlock_new_inode(inode);
>>> trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v2] f2fs: support large folio for immutable non-compressed case
2025-11-21 10:20 ` [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Chao Yu
2025-11-22 1:17 ` Jaegeuk Kim
@ 2025-12-01 19:31 ` Jaegeuk Kim
2025-12-01 21:37 ` Chao Yu
2025-12-01 22:30 ` [f2fs-dev] [PATCH 1/2 v3] " Jaegeuk Kim
1 sibling, 2 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 19:31 UTC (permalink / raw)
To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel
This patch enables large folio for limited case where we can get the high-order
memory allocation. It supports the encrypted and fsverity files, which are
essential for Android environment.
How to test:
- dd if=/dev/zero of=/mnt/test/test bs=1G count=4
- f2fs_io setflags immutable /mnt/test/test
- echo 3 > /proc/sys/vm/drop_caches
: to reload inode with large folio
- f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
Change log from v1:
- return error when trying open an inode having large folio
fs/f2fs/data.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/f2fs/f2fs.h | 16 ++++
fs/f2fs/file.c | 4 +
fs/f2fs/inode.c | 6 +-
4 files changed, 263 insertions(+), 10 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 48c20386f031..acc4ef511bfb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio->bi_status = BLK_STS_IOERR;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5f104518c414..71adfacaca45 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4921,6 +4921,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct inode *inode)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d7047ca6b98d..e75e61ac50d7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
+ if (mapping_large_folio_support(inode->i_mapping) &&
+ filp->f_mode & FMODE_WRITE)
+ return -EOPNOTSUPP;
+
err = fsverity_file_open(inode, filp);
if (err)
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e2405b79b3cc..9162154d5211 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(inode))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
--
2.52.0.107.ga0afd4fd5b-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v2] f2fs: support large folio for immutable non-compressed case
2025-12-01 19:31 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
@ 2025-12-01 21:37 ` Chao Yu
2025-12-01 22:30 ` [f2fs-dev] [PATCH 1/2 v3] " Jaegeuk Kim
1 sibling, 0 replies; 20+ messages in thread
From: Chao Yu @ 2025-12-01 21:37 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: chao, linux-kernel, linux-f2fs-devel
On 2025/12/2 03:31, Jaegeuk Kim wrote:
> This patch enables large folios for the limited case where we can get high-order
> memory allocations. It supports encrypted and fsverity files, which are
> essential for the Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
How about adding large folio usage to the f2fs documentation for user guidance,
including how to enable/disable the large folio feature on a per-file basis?
Thanks,
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> Change log from v1:
> - return an error when trying to open an inode having a large folio
>
> fs/f2fs/data.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++--
> fs/f2fs/f2fs.h | 16 ++++
> fs/f2fs/file.c | 4 +
> fs/f2fs/inode.c | 6 +-
> 4 files changed, 263 insertions(+), 10 deletions(-)
>
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 48c20386f031..acc4ef511bfb 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -31,9 +31,15 @@
>
> static struct kmem_cache *bio_post_read_ctx_cache;
> static struct kmem_cache *bio_entry_slab;
> +static struct kmem_cache *ffs_entry_slab;
> static mempool_t *bio_post_read_ctx_pool;
> static struct bio_set f2fs_bioset;
>
> +struct f2fs_folio_state {
> + spinlock_t state_lock;
> + unsigned int read_pages_pending;
> +};
> +
> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>
> int __init f2fs_init_bioset(void)
> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> {
> struct folio_iter fi;
> struct bio_post_read_ctx *ctx = bio->bi_private;
> + unsigned long flags;
>
> bio_for_each_folio_all(fi, bio) {
> struct folio *folio = fi.folio;
> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> + bool finished = true;
>
> - if (f2fs_is_compressed_page(folio)) {
> + if (!folio_test_large(folio) &&
> + f2fs_is_compressed_page(folio)) {
> if (ctx && !ctx->decompression_attempted)
> f2fs_end_read_compressed_page(folio, true, 0,
> in_task);
> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> bio->bi_status = BLK_STS_IOERR;
> }
>
> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> + if (folio_test_large(folio)) {
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + spin_lock_irqsave(&ffs->state_lock, flags);
> + ffs->read_pages_pending -= nr_pages;
> + finished = !ffs->read_pages_pending;
> + spin_unlock_irqrestore(&ffs->state_lock, flags);
> + }
> +
> + while (nr_pages--)
> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> +
> + if (finished)
> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> }
>
> if (ctx)
> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> enum page_type type)
> {
> + if (!bio)
> + return;
> +
> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>
> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> struct dnode_of_data dn;
> struct folio *folio;
> int err;
> -
> +retry:
> folio = f2fs_grab_cache_folio(mapping, index, for_write);
> if (IS_ERR(folio))
> return folio;
>
> + if (folio_test_large(folio)) {
> + pgoff_t folio_index = mapping_align_index(mapping, index);
> +
> + f2fs_folio_put(folio, true);
> + invalidate_inode_pages2_range(mapping, folio_index,
> + folio_index + folio_nr_pages(folio) - 1);
> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> + goto retry;
> + }
> +
> if (f2fs_lookup_read_extent_cache_block(inode, index,
> &dn.data_blkaddr)) {
> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> @@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> }
> #endif
>
> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + if (ffs)
> + return ffs;
> +
> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> +
> + spin_lock_init(&ffs->state_lock);
> + folio_attach_private(folio, ffs);
> + return ffs;
> +}
> +
> +static void ffs_detach_free(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs;
> +
> + if (!folio_test_large(folio)) {
> + folio_detach_private(folio);
> + return;
> + }
> +
> + ffs = folio_detach_private(folio);
> + if (!ffs)
> + return;
> +
> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> + kmem_cache_free(ffs_entry_slab, ffs);
> +}
> +
> +static int f2fs_read_data_large_folio(struct inode *inode,
> + struct readahead_control *rac, struct folio *folio)
> +{
> + struct bio *bio = NULL;
> + sector_t last_block_in_bio = 0;
> + struct f2fs_map_blocks map = {0, };
> + pgoff_t index, offset;
> + unsigned max_nr_pages = rac ? readahead_count(rac) :
> + folio_nr_pages(folio);
> + unsigned nrpages;
> + struct f2fs_folio_state *ffs;
> + int ret = 0;
> +
> + if (!IS_IMMUTABLE(inode))
> + return -EOPNOTSUPP;
> +
> + if (f2fs_compressed_file(inode))
> + return -EOPNOTSUPP;
> +
> + map.m_seg_type = NO_CHECK_TYPE;
> +
> + if (rac)
> + folio = readahead_folio(rac);
> +next_folio:
> + if (!folio)
> + goto out;
> +
> + index = folio->index;
> + offset = 0;
> + ffs = NULL;
> + nrpages = folio_nr_pages(folio);
> +
> + for (; nrpages; nrpages--) {
> + sector_t block_nr;
> + /*
> + * Map blocks using the previous result first.
> + */
> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> + index > map.m_lblk &&
> + index < (map.m_lblk + map.m_len))
> + goto got_it;
> +
> + /*
> + * Then do more f2fs_map_blocks() calls until we are
> + * done with this page.
> + */
> + memset(&map, 0, sizeof(map));
> + map.m_seg_type = NO_CHECK_TYPE;
> + map.m_lblk = index;
> + map.m_len = max_nr_pages;
> +
> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> + if (ret)
> + goto err_out;
> +got_it:
> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> + block_nr = map.m_pblk + index - map.m_lblk;
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> + DATA_GENERIC_ENHANCE_READ)) {
> + ret = -EFSCORRUPTED;
> + goto err_out;
> + }
> + } else {
> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> + if (f2fs_need_verity(inode, index) &&
> + !fsverity_verify_page(folio_file_page(folio,
> + index))) {
> + ret = -EIO;
> + goto err_out;
> + }
> + continue;
> + }
> +
> + /*
> + * This page will go to BIO. Do we need to send this
> + * BIO off first?
> + */
> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> + last_block_in_bio, block_nr) ||
> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> +submit_and_realloc:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + bio = NULL;
> + }
> + if (bio == NULL)
> + bio = f2fs_grab_read_bio(inode, block_nr,
> + max_nr_pages,
> + f2fs_ra_op_flags(rac),
> + index, false);
> +
> + /*
> + * If the page is under writeback, we need to wait for
> + * its completion to see the correct decrypted data.
> + */
> + f2fs_wait_on_block_writeback(inode, block_nr);
> +
> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> + offset << PAGE_SHIFT))
> + goto submit_and_realloc;
> +
> + if (folio_test_large(folio)) {
> + ffs = ffs_find_or_alloc(folio);
> +
> + /* set the bitmap to wait */
> + spin_lock_irq(&ffs->state_lock);
> + ffs->read_pages_pending++;
> + spin_unlock_irq(&ffs->state_lock);
> + }
> +
> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> + F2FS_BLKSIZE);
> + last_block_in_bio = block_nr;
> + index++;
> + offset++;
> + }
> + if (rac) {
> + folio = readahead_folio(rac);
> + goto next_folio;
> + }
> +err_out:
> + /* Nothing was submitted. */
> + if (!bio) {
> + if (!ret)
> + folio_mark_uptodate(folio);
> + folio_unlock(folio);
> + return ret;
> + }
> +
> + if (ret) {
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> +
> + /* Wait bios and clear uptodate. */
> + folio_lock(folio);
> + folio_clear_uptodate(folio);
> + folio_unlock(folio);
> + }
> +out:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + return ret;
> +}
> +
> /*
> * This function was originally taken from fs/mpage.c, and customized for f2fs.
> * Major change was from block_size == page_size in f2fs by default.
> @@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> pgoff_t index;
> #endif
> unsigned nr_pages = rac ? readahead_count(rac) : 1;
> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> unsigned max_nr_pages = nr_pages;
> int ret = 0;
>
> + if (mapping_large_folio_support(mapping))
> + return f2fs_read_data_large_folio(inode, rac, folio);
> +
> #ifdef CONFIG_F2FS_FS_COMPRESSION
> if (f2fs_compressed_file(inode)) {
> index = rac ? readahead_index(rac) : folio->index;
> @@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> }
> #endif
> }
> - if (bio)
> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> return ret;
> }
>
> @@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> f2fs_remove_dirty_inode(inode);
> }
> }
> - folio_detach_private(folio);
> +
> + if (offset || length != folio_size(folio))
> + return;
> +
> + folio_cancel_dirty(folio);
> + ffs_detach_free(folio);
> }
>
> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> @@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> if (folio_test_dirty(folio))
> return false;
>
> - folio_detach_private(folio);
> + ffs_detach_free(folio);
> return true;
> }
>
> @@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
> {
> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> sizeof(struct bio_entry));
> - return bio_entry_slab ? 0 : -ENOMEM;
> +
> + if (!bio_entry_slab)
> + return -ENOMEM;
> +
> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> + sizeof(struct f2fs_folio_state));
> +
> + if (!ffs_entry_slab) {
> + kmem_cache_destroy(bio_entry_slab);
> + return -ENOMEM;
> + }
> +
> + return 0;
> }
>
> void f2fs_destroy_bio_entry_cache(void)
> {
> kmem_cache_destroy(bio_entry_slab);
> + kmem_cache_destroy(ffs_entry_slab);
> }
>
> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 5f104518c414..71adfacaca45 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -4921,6 +4921,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> return false;
> }
>
> +static inline bool f2fs_quota_file(struct inode *inode)
> +{
> +#ifdef CONFIG_QUOTA
> + int i;
> +
> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> + return false;
> +
> + for (i = 0; i < MAXQUOTAS; i++) {
> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> + return true;
> + }
> +#endif
> + return false;
> +}
> +
> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> {
> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index d7047ca6b98d..e75e61ac50d7 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
> if (!f2fs_is_compress_backend_ready(inode))
> return -EOPNOTSUPP;
>
> + if (mapping_large_folio_support(inode->i_mapping) &&
> + filp->f_mode & FMODE_WRITE)
> + return -EOPNOTSUPP;
> +
> err = fsverity_file_open(inode, filp);
> if (err)
> return err;
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index e2405b79b3cc..9162154d5211 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> if (ret)
> goto bad_inode;
> make_now:
> + f2fs_set_inode_flags(inode);
> +
> if (ino == F2FS_NODE_INO(sbi)) {
> inode->i_mapping->a_ops = &f2fs_node_aops;
> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> inode->i_op = &f2fs_file_inode_operations;
> inode->i_fop = &f2fs_file_operations;
> inode->i_mapping->a_ops = &f2fs_dblock_aops;
> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> + !f2fs_quota_file(inode))
> + mapping_set_folio_min_order(inode->i_mapping, 0);
> } else if (S_ISDIR(inode->i_mode)) {
> inode->i_op = &f2fs_dir_inode_operations;
> inode->i_fop = &f2fs_dir_operations;
> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> ret = -EIO;
> goto bad_inode;
> }
> - f2fs_set_inode_flags(inode);
>
> unlock_new_inode(inode);
> trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v3] f2fs: support large folio for immutable non-compressed case
2025-12-01 19:31 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
2025-12-01 21:37 ` Chao Yu
@ 2025-12-01 22:30 ` Jaegeuk Kim
2025-12-01 22:37 ` Chao Yu
2025-12-02 2:38 ` [f2fs-dev] [PATCH 1/2 v4] " Jaegeuk Kim
1 sibling, 2 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 22:30 UTC (permalink / raw)
To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel
This patch enables large folios for the limited case where we can get high-order
memory allocations. It supports encrypted and fsverity files, which are
essential for the Android environment.
How to test:
- dd if=/dev/zero of=/mnt/test/test bs=1G count=4
- f2fs_io setflags immutable /mnt/test/test
- echo 3 > /proc/sys/vm/drop_caches
: to reload inode with large folio
- f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
Change log from v2:
- add a doc
Change log from v1:
- return an error when trying to open an inode having a large folio
Documentation/filesystems/f2fs.rst | 27 ++++
fs/f2fs/data.c | 247 +++++++++++++++++++++++++++--
fs/f2fs/f2fs.h | 16 ++
fs/f2fs/file.c | 4 +
fs/f2fs/inode.c | 6 +-
5 files changed, 290 insertions(+), 10 deletions(-)
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index cb90d1ae82d0..085142f4d085 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -1033,3 +1033,30 @@ the reserved space back to F2FS for its own use.
So, the key idea is, user can do any file operations on /dev/vdc, and
reclaim the space after the use, while the space is counted as /data.
That doesn't require modifying partition size and filesystem format.
+
+Per-file Read-Only Large Folio Support
+--------------------------------------
+
+F2FS implements large folio support on the read path to leverage high-order
+page allocation for significant performance gains. To minimize code complexity,
+this support is currently excluded from the write path, which requires handling
+complex optimizations such as compression and block allocation modes.
+
+This feature is optional and is activated only when the immutable bit is set on
+a file. The following example demonstrates the usage flow:
+
+.. code-block::
+
+ # f2fs_io setflags immutable /data/testfile_read_seq
+
+ /* mmap(MAP_POPULATE) + mlock() */
+ # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
+
+ /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
+ # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
+
+ /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
+ # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
+
+ # f2fs_io clearflags immutable /data/testfile_read_seq
+ # rm /data/testfile_read_seq
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 48c20386f031..acc4ef511bfb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio->bi_status = BLK_STS_IOERR;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 86785068554f..d7600979218e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4928,6 +4928,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct inode *inode)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d7047ca6b98d..e75e61ac50d7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
+ if (mapping_large_folio_support(inode->i_mapping) &&
+ filp->f_mode & FMODE_WRITE)
+ return -EOPNOTSUPP;
+
err = fsverity_file_open(inode, filp);
if (err)
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e2405b79b3cc..9162154d5211 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(inode))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
--
2.52.0.107.ga0afd4fd5b-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v3] f2fs: support large folio for immutable non-compressed case
2025-12-01 22:30 ` [f2fs-dev] [PATCH 1/2 v3] " Jaegeuk Kim
@ 2025-12-01 22:37 ` Chao Yu
2025-12-02 2:38 ` [f2fs-dev] [PATCH 1/2 v4] " Jaegeuk Kim
1 sibling, 0 replies; 20+ messages in thread
From: Chao Yu @ 2025-12-01 22:37 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: chao, linux-kernel, linux-f2fs-devel
On 2025/12/2 06:30, Jaegeuk Kim wrote:
> This patch enables large folio for limited case where we can get the high-order
> memory allocation. It supports the encrypted and fsverity files, which are
> essential for Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
>
> Change log from v2:
> - add a doc
>
> Change log from v1:
> - return error when trying open an inode having large folio
>
> Documentation/filesystems/f2fs.rst | 27 ++++
> fs/f2fs/data.c | 247 +++++++++++++++++++++++++++--
> fs/f2fs/f2fs.h | 16 ++
> fs/f2fs/file.c | 4 +
> fs/f2fs/inode.c | 6 +-
> 5 files changed, 290 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
> index cb90d1ae82d0..085142f4d085 100644
> --- a/Documentation/filesystems/f2fs.rst
> +++ b/Documentation/filesystems/f2fs.rst
> @@ -1033,3 +1033,30 @@ the reserved space back to F2FS for its own use.
> So, the key idea is, user can do any file operations on /dev/vdc, and
> reclaim the space after the use, while the space is counted as /data.
> That doesn't require modifying partition size and filesystem format.
> +
> +Per-file Read-Only Large Folio Support
> +--------------------------------------
> +
> +F2FS implements large folio support on the read path to leverage high-order
> +page allocation for significant performance gains. To minimize code complexity,
> +this support is currently excluded from the write path, which requires handling
> +complex optimizations such as compression and block allocation modes.
> +
> +This feature is optional and is activated only when the immutable bit is set on
> +a file. The following example demonstrates the usage flow:
> +
> +.. code-block::
> +
> + # f2fs_io setflags immutable /data/testfile_read_seq
Is there a missing step to drop the inode cache? It expects to enable the large
folio in a later f2fs_iget().
Otherwise, the patch looks good to me.
Thanks,
> +
> + /* mmap(MAP_POPULATE) + mlock() */
> + # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
> +
> + /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
> + # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
> +
> + /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
> + # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
> +
> + # f2fs_io clearflags immutable /data/testfile_read_seq
> + # rm /data/testfile_read_seq
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 48c20386f031..acc4ef511bfb 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -31,9 +31,15 @@
>
> static struct kmem_cache *bio_post_read_ctx_cache;
> static struct kmem_cache *bio_entry_slab;
> +static struct kmem_cache *ffs_entry_slab;
> static mempool_t *bio_post_read_ctx_pool;
> static struct bio_set f2fs_bioset;
>
> +struct f2fs_folio_state {
> + spinlock_t state_lock;
> + unsigned int read_pages_pending;
> +};
> +
> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>
> int __init f2fs_init_bioset(void)
> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> {
> struct folio_iter fi;
> struct bio_post_read_ctx *ctx = bio->bi_private;
> + unsigned long flags;
>
> bio_for_each_folio_all(fi, bio) {
> struct folio *folio = fi.folio;
> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> + bool finished = true;
>
> - if (f2fs_is_compressed_page(folio)) {
> + if (!folio_test_large(folio) &&
> + f2fs_is_compressed_page(folio)) {
> if (ctx && !ctx->decompression_attempted)
> f2fs_end_read_compressed_page(folio, true, 0,
> in_task);
> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> bio->bi_status = BLK_STS_IOERR;
> }
>
> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> + if (folio_test_large(folio)) {
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + spin_lock_irqsave(&ffs->state_lock, flags);
> + ffs->read_pages_pending -= nr_pages;
> + finished = !ffs->read_pages_pending;
> + spin_unlock_irqrestore(&ffs->state_lock, flags);
> + }
> +
> + while (nr_pages--)
> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> +
> + if (finished)
> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> }
>
> if (ctx)
> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> enum page_type type)
> {
> + if (!bio)
> + return;
> +
> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>
> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> struct dnode_of_data dn;
> struct folio *folio;
> int err;
> -
> +retry:
> folio = f2fs_grab_cache_folio(mapping, index, for_write);
> if (IS_ERR(folio))
> return folio;
>
> + if (folio_test_large(folio)) {
> + pgoff_t folio_index = mapping_align_index(mapping, index);
> +
> + f2fs_folio_put(folio, true);
> + invalidate_inode_pages2_range(mapping, folio_index,
> + folio_index + folio_nr_pages(folio) - 1);
> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> + goto retry;
> + }
> +
> if (f2fs_lookup_read_extent_cache_block(inode, index,
> &dn.data_blkaddr)) {
> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> @@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> }
> #endif
>
> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + if (ffs)
> + return ffs;
> +
> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> +
> + spin_lock_init(&ffs->state_lock);
> + folio_attach_private(folio, ffs);
> + return ffs;
> +}
> +
> +static void ffs_detach_free(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs;
> +
> + if (!folio_test_large(folio)) {
> + folio_detach_private(folio);
> + return;
> + }
> +
> + ffs = folio_detach_private(folio);
> + if (!ffs)
> + return;
> +
> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> + kmem_cache_free(ffs_entry_slab, ffs);
> +}
> +
> +static int f2fs_read_data_large_folio(struct inode *inode,
> + struct readahead_control *rac, struct folio *folio)
> +{
> + struct bio *bio = NULL;
> + sector_t last_block_in_bio = 0;
> + struct f2fs_map_blocks map = {0, };
> + pgoff_t index, offset;
> + unsigned max_nr_pages = rac ? readahead_count(rac) :
> + folio_nr_pages(folio);
> + unsigned nrpages;
> + struct f2fs_folio_state *ffs;
> + int ret = 0;
> +
> + if (!IS_IMMUTABLE(inode))
> + return -EOPNOTSUPP;
> +
> + if (f2fs_compressed_file(inode))
> + return -EOPNOTSUPP;
> +
> + map.m_seg_type = NO_CHECK_TYPE;
> +
> + if (rac)
> + folio = readahead_folio(rac);
> +next_folio:
> + if (!folio)
> + goto out;
> +
> + index = folio->index;
> + offset = 0;
> + ffs = NULL;
> + nrpages = folio_nr_pages(folio);
> +
> + for (; nrpages; nrpages--) {
> + sector_t block_nr;
> + /*
> + * Map blocks using the previous result first.
> + */
> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> + index > map.m_lblk &&
> + index < (map.m_lblk + map.m_len))
> + goto got_it;
> +
> + /*
> + * Then do more f2fs_map_blocks() calls until we are
> + * done with this page.
> + */
> + memset(&map, 0, sizeof(map));
> + map.m_seg_type = NO_CHECK_TYPE;
> + map.m_lblk = index;
> + map.m_len = max_nr_pages;
> +
> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> + if (ret)
> + goto err_out;
> +got_it:
> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> + block_nr = map.m_pblk + index - map.m_lblk;
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> + DATA_GENERIC_ENHANCE_READ)) {
> + ret = -EFSCORRUPTED;
> + goto err_out;
> + }
> + } else {
> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> + if (f2fs_need_verity(inode, index) &&
> + !fsverity_verify_page(folio_file_page(folio,
> + index))) {
> + ret = -EIO;
> + goto err_out;
> + }
> + continue;
> + }
> +
> + /*
> + * This page will go to BIO. Do we need to send this
> + * BIO off first?
> + */
> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> + last_block_in_bio, block_nr) ||
> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> +submit_and_realloc:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + bio = NULL;
> + }
> + if (bio == NULL)
> + bio = f2fs_grab_read_bio(inode, block_nr,
> + max_nr_pages,
> + f2fs_ra_op_flags(rac),
> + index, false);
> +
> + /*
> + * If the page is under writeback, we need to wait for
> + * its completion to see the correct decrypted data.
> + */
> + f2fs_wait_on_block_writeback(inode, block_nr);
> +
> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> + offset << PAGE_SHIFT))
> + goto submit_and_realloc;
> +
> + if (folio_test_large(folio)) {
> + ffs = ffs_find_or_alloc(folio);
> +
> + /* set the bitmap to wait */
> + spin_lock_irq(&ffs->state_lock);
> + ffs->read_pages_pending++;
> + spin_unlock_irq(&ffs->state_lock);
> + }
> +
> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> + F2FS_BLKSIZE);
> + last_block_in_bio = block_nr;
> + index++;
> + offset++;
> + }
> + if (rac) {
> + folio = readahead_folio(rac);
> + goto next_folio;
> + }
> +err_out:
> + /* Nothing was submitted. */
> + if (!bio) {
> + if (!ret)
> + folio_mark_uptodate(folio);
> + folio_unlock(folio);
> + return ret;
> + }
> +
> + if (ret) {
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> +
> + /* Wait bios and clear uptodate. */
> + folio_lock(folio);
> + folio_clear_uptodate(folio);
> + folio_unlock(folio);
> + }
> +out:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + return ret;
> +}
> +
> /*
> * This function was originally taken from fs/mpage.c, and customized for f2fs.
> * Major change was from block_size == page_size in f2fs by default.
> @@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> pgoff_t index;
> #endif
> unsigned nr_pages = rac ? readahead_count(rac) : 1;
> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> unsigned max_nr_pages = nr_pages;
> int ret = 0;
>
> + if (mapping_large_folio_support(mapping))
> + return f2fs_read_data_large_folio(inode, rac, folio);
> +
> #ifdef CONFIG_F2FS_FS_COMPRESSION
> if (f2fs_compressed_file(inode)) {
> index = rac ? readahead_index(rac) : folio->index;
> @@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> }
> #endif
> }
> - if (bio)
> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> return ret;
> }
>
> @@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> f2fs_remove_dirty_inode(inode);
> }
> }
> - folio_detach_private(folio);
> +
> + if (offset || length != folio_size(folio))
> + return;
> +
> + folio_cancel_dirty(folio);
> + ffs_detach_free(folio);
> }
>
> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> @@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> if (folio_test_dirty(folio))
> return false;
>
> - folio_detach_private(folio);
> + ffs_detach_free(folio);
> return true;
> }
>
> @@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
> {
> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> sizeof(struct bio_entry));
> - return bio_entry_slab ? 0 : -ENOMEM;
> +
> + if (!bio_entry_slab)
> + return -ENOMEM;
> +
> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> + sizeof(struct f2fs_folio_state));
> +
> + if (!ffs_entry_slab) {
> + kmem_cache_destroy(bio_entry_slab);
> + return -ENOMEM;
> + }
> +
> + return 0;
> }
>
> void f2fs_destroy_bio_entry_cache(void)
> {
> kmem_cache_destroy(bio_entry_slab);
> + kmem_cache_destroy(ffs_entry_slab);
> }
>
> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 86785068554f..d7600979218e 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -4928,6 +4928,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> return false;
> }
>
> +static inline bool f2fs_quota_file(struct inode *inode)
> +{
> +#ifdef CONFIG_QUOTA
> + int i;
> +
> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> + return false;
> +
> + for (i = 0; i < MAXQUOTAS; i++) {
> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> + return true;
> + }
> +#endif
> + return false;
> +}
> +
> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> {
> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index d7047ca6b98d..e75e61ac50d7 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
> if (!f2fs_is_compress_backend_ready(inode))
> return -EOPNOTSUPP;
>
> + if (mapping_large_folio_support(inode->i_mapping) &&
> + filp->f_mode & FMODE_WRITE)
> + return -EOPNOTSUPP;
> +
> err = fsverity_file_open(inode, filp);
> if (err)
> return err;
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index e2405b79b3cc..9162154d5211 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> if (ret)
> goto bad_inode;
> make_now:
> + f2fs_set_inode_flags(inode);
> +
> if (ino == F2FS_NODE_INO(sbi)) {
> inode->i_mapping->a_ops = &f2fs_node_aops;
> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> inode->i_op = &f2fs_file_inode_operations;
> inode->i_fop = &f2fs_file_operations;
> inode->i_mapping->a_ops = &f2fs_dblock_aops;
> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> + !f2fs_quota_file(inode))
> + mapping_set_folio_min_order(inode->i_mapping, 0);
> } else if (S_ISDIR(inode->i_mode)) {
> inode->i_op = &f2fs_dir_inode_operations;
> inode->i_fop = &f2fs_dir_operations;
> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> ret = -EIO;
> goto bad_inode;
> }
> - f2fs_set_inode_flags(inode);
>
> unlock_new_inode(inode);
> trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2025-12-01 22:30 ` [f2fs-dev] [PATCH 1/2 v3] " Jaegeuk Kim
2025-12-01 22:37 ` Chao Yu
@ 2025-12-02 2:38 ` Jaegeuk Kim
2025-12-02 18:07 ` Chao Yu
2025-12-09 8:32 ` Chao Yu
1 sibling, 2 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2025-12-02 2:38 UTC (permalink / raw)
To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel
This patch enables large folios for a limited case where we can get high-order
memory allocations. It supports encrypted and fsverity files, which are
essential for the Android environment.
How to test:
- dd if=/dev/zero of=/mnt/test/test bs=1G count=4
- f2fs_io setflags immutable /mnt/test/test
- echo 3 > /proc/sys/vm/drop_caches
: to reload inode with large folio
- f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
Change log from v3:
- enhance the doc
Change log from v2:
- add a doc
Change log from v1:
- return an error when trying to open an inode having a large folio
Documentation/filesystems/f2fs.rst | 43 +++++
fs/f2fs/data.c | 247 +++++++++++++++++++++++++++--
fs/f2fs/f2fs.h | 16 ++
fs/f2fs/file.c | 4 +
fs/f2fs/inode.c | 6 +-
5 files changed, 306 insertions(+), 10 deletions(-)
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index cb90d1ae82d0..9b3b835a174e 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -1033,3 +1033,46 @@ the reserved space back to F2FS for its own use.
So, the key idea is, user can do any file operations on /dev/vdc, and
reclaim the space after the use, while the space is counted as /data.
That doesn't require modifying partition size and filesystem format.
+
+Per-file Read-Only Large Folio Support
+--------------------------------------
+
+F2FS implements large folio support on the read path to leverage high-order
+page allocation for significant performance gains. To minimize code complexity,
+this support is currently excluded from the write path, which requires handling
+complex optimizations such as compression and block allocation modes.
+
+This optional feature is triggered only when a file's immutable bit is set.
+Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
+file with write permissions, even immediately after clearing the bit. Write
+access is only restored once the cached inode is dropped. The usage flow is
+demonstrated below:
+
+.. code-block::
+
+ # f2fs_io setflags immutable /data/testfile_read_seq
+
+ /* flush and reload the inode to enable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ /* mmap(MAP_POPULATE) + mlock() */
+ # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
+
+ /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
+ # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
+
+ /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
+ # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
+
+ # f2fs_io clearflags immutable /data/testfile_read_seq
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Failed to open /data/testfile_read_seq: Operation not supported
+
+ /* flush and reload the inode to disable the large folio */
+ # sync && echo 3 > /proc/sys/vm/drop_caches
+
+ # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
+ Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
+
+ # rm /data/testfile_read_seq
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 48c20386f031..acc4ef511bfb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,9 +31,15 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *ffs_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static struct bio_set f2fs_bioset;
+struct f2fs_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_pages_pending;
+};
+
#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
int __init f2fs_init_bioset(void)
@@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct folio_iter fi;
struct bio_post_read_ctx *ctx = bio->bi_private;
+ unsigned long flags;
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
+ unsigned nr_pages = fi.length >> PAGE_SHIFT;
+ bool finished = true;
- if (f2fs_is_compressed_page(folio)) {
+ if (!folio_test_large(folio) &&
+ f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(folio, true, 0,
in_task);
@@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio->bi_status = BLK_STS_IOERR;
}
- dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == BLK_STS_OK);
+ if (folio_test_large(folio)) {
+ struct f2fs_folio_state *ffs = folio->private;
+
+ spin_lock_irqsave(&ffs->state_lock, flags);
+ ffs->read_pages_pending -= nr_pages;
+ finished = !ffs->read_pages_pending;
+ spin_unlock_irqrestore(&ffs->state_lock, flags);
+ }
+
+ while (nr_pages--)
+ dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
+
+ if (finished)
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
enum page_type type)
{
+ if (!bio)
+ return;
+
WARN_ON_ONCE(!is_read_io(bio_op(bio)));
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
@@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
struct folio *folio;
int err;
-
+retry:
folio = f2fs_grab_cache_folio(mapping, index, for_write);
if (IS_ERR(folio))
return folio;
+ if (folio_test_large(folio)) {
+ pgoff_t folio_index = mapping_align_index(mapping, index);
+
+ f2fs_folio_put(folio, true);
+ invalidate_inode_pages2_range(mapping, folio_index,
+ folio_index + folio_nr_pages(folio) - 1);
+ f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
+ goto retry;
+ }
+
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
@@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
}
#endif
+static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs = folio->private;
+
+ if (ffs)
+ return ffs;
+
+ ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
+
+ spin_lock_init(&ffs->state_lock);
+ folio_attach_private(folio, ffs);
+ return ffs;
+}
+
+static void ffs_detach_free(struct folio *folio)
+{
+ struct f2fs_folio_state *ffs;
+
+ if (!folio_test_large(folio)) {
+ folio_detach_private(folio);
+ return;
+ }
+
+ ffs = folio_detach_private(folio);
+ if (!ffs)
+ return;
+
+ WARN_ON_ONCE(ffs->read_pages_pending != 0);
+ kmem_cache_free(ffs_entry_slab, ffs);
+}
+
+static int f2fs_read_data_large_folio(struct inode *inode,
+ struct readahead_control *rac, struct folio *folio)
+{
+ struct bio *bio = NULL;
+ sector_t last_block_in_bio = 0;
+ struct f2fs_map_blocks map = {0, };
+ pgoff_t index, offset;
+ unsigned max_nr_pages = rac ? readahead_count(rac) :
+ folio_nr_pages(folio);
+ unsigned nrpages;
+ struct f2fs_folio_state *ffs;
+ int ret = 0;
+
+ if (!IS_IMMUTABLE(inode))
+ return -EOPNOTSUPP;
+
+ if (f2fs_compressed_file(inode))
+ return -EOPNOTSUPP;
+
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (rac)
+ folio = readahead_folio(rac);
+next_folio:
+ if (!folio)
+ goto out;
+
+ index = folio->index;
+ offset = 0;
+ ffs = NULL;
+ nrpages = folio_nr_pages(folio);
+
+ for (; nrpages; nrpages--) {
+ sector_t block_nr;
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ index > map.m_lblk &&
+ index < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ memset(&map, 0, sizeof(map));
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_lblk = index;
+ map.m_len = max_nr_pages;
+
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
+ if (ret)
+ goto err_out;
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + index - map.m_lblk;
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
+ DATA_GENERIC_ENHANCE_READ)) {
+ ret = -EFSCORRUPTED;
+ goto err_out;
+ }
+ } else {
+ folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
+ if (f2fs_need_verity(inode, index) &&
+ !fsverity_verify_page(folio_file_page(folio,
+ index))) {
+ ret = -EIO;
+ goto err_out;
+ }
+ continue;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
+ last_block_in_bio, block_nr) ||
+ !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
+submit_and_realloc:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL)
+ bio = f2fs_grab_read_bio(inode, block_nr,
+ max_nr_pages,
+ f2fs_ra_op_flags(rac),
+ index, false);
+
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
+ if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
+ offset << PAGE_SHIFT))
+ goto submit_and_realloc;
+
+ if (folio_test_large(folio)) {
+ ffs = ffs_find_or_alloc(folio);
+
+ /* set the bitmap to wait */
+ spin_lock_irq(&ffs->state_lock);
+ ffs->read_pages_pending++;
+ spin_unlock_irq(&ffs->state_lock);
+ }
+
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
+ F2FS_BLKSIZE);
+ last_block_in_bio = block_nr;
+ index++;
+ offset++;
+ }
+ if (rac) {
+ folio = readahead_folio(rac);
+ goto next_folio;
+ }
+err_out:
+ /* Nothing was submitted. */
+ if (!bio) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return ret;
+ }
+
+ if (ret) {
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+
+ /* Wait bios and clear uptodate. */
+ folio_lock(folio);
+ folio_clear_uptodate(folio);
+ folio_unlock(folio);
+ }
+out:
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return ret;
+}
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
pgoff_t index;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
+ struct address_space *mapping = rac ? rac->mapping : folio->mapping;
unsigned max_nr_pages = nr_pages;
int ret = 0;
+ if (mapping_large_folio_support(mapping))
+ return f2fs_read_data_large_folio(inode, rac, folio);
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
index = rac ? readahead_index(rac) : folio->index;
@@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#endif
}
- if (bio)
- f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- folio_detach_private(folio);
+
+ if (offset || length != folio_size(folio))
+ return;
+
+ folio_cancel_dirty(folio);
+ ffs_detach_free(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- folio_detach_private(folio);
+ ffs_detach_free(folio);
return true;
}
@@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- return bio_entry_slab ? 0 : -ENOMEM;
+
+ if (!bio_entry_slab)
+ return -ENOMEM;
+
+ ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
+ sizeof(struct f2fs_folio_state));
+
+ if (!ffs_entry_slab) {
+ kmem_cache_destroy(bio_entry_slab);
+ return -ENOMEM;
+ }
+
+ return 0;
}
void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(ffs_entry_slab);
}
static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 86785068554f..d7600979218e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4928,6 +4928,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_quota_file(struct inode *inode)
+{
+#ifdef CONFIG_QUOTA
+ int i;
+
+ if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
+ return false;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
+ return true;
+ }
+#endif
+ return false;
+}
+
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d7047ca6b98d..e75e61ac50d7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
+ if (mapping_large_folio_support(inode->i_mapping) &&
+ filp->f_mode & FMODE_WRITE)
+ return -EOPNOTSUPP;
+
err = fsverity_file_open(inode, filp);
if (err)
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e2405b79b3cc..9162154d5211 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ret)
goto bad_inode;
make_now:
+ f2fs_set_inode_flags(inode);
+
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
+ !f2fs_quota_file(inode))
+ mapping_set_folio_min_order(inode->i_mapping, 0);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = -EIO;
goto bad_inode;
}
- f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
--
2.52.0.107.ga0afd4fd5b-goog
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2025-12-02 2:38 ` [f2fs-dev] [PATCH 1/2 v4] " Jaegeuk Kim
@ 2025-12-02 18:07 ` Chao Yu
2025-12-09 8:32 ` Chao Yu
1 sibling, 0 replies; 20+ messages in thread
From: Chao Yu @ 2025-12-02 18:07 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: chao, linux-kernel, linux-f2fs-devel
On 2025/12/2 10:38, Jaegeuk Kim wrote:
> This patch enables large folio for limited case where we can get the high-order
> memory allocation. It supports the encrypted and fsverity files, which are
> essential for Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Thanks,
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2025-12-02 2:38 ` [f2fs-dev] [PATCH 1/2 v4] " Jaegeuk Kim
2025-12-02 18:07 ` Chao Yu
@ 2025-12-09 8:32 ` Chao Yu
2025-12-09 18:38 ` Jaegeuk Kim
1 sibling, 1 reply; 20+ messages in thread
From: Chao Yu @ 2025-12-09 8:32 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: chao, linux-kernel, linux-f2fs-devel
On 12/2/25 10:38, Jaegeuk Kim wrote:
> This patch enables large folio for limited case where we can get the high-order
> memory allocation. It supports the encrypted and fsverity files, which are
> essential for Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> Change log from v3:
> - enhance the doc
>
> Change log from v2:
> - add a doc
>
> Change log from v1:
> - return error when trying to open an inode having a large folio
>
> Documentation/filesystems/f2fs.rst | 43 +++++
> fs/f2fs/data.c | 247 +++++++++++++++++++++++++++--
> fs/f2fs/f2fs.h | 16 ++
> fs/f2fs/file.c | 4 +
> fs/f2fs/inode.c | 6 +-
> 5 files changed, 306 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
> index cb90d1ae82d0..9b3b835a174e 100644
> --- a/Documentation/filesystems/f2fs.rst
> +++ b/Documentation/filesystems/f2fs.rst
> @@ -1033,3 +1033,46 @@ the reserved space back to F2FS for its own use.
> So, the key idea is, user can do any file operations on /dev/vdc, and
> reclaim the space after the use, while the space is counted as /data.
> That doesn't require modifying partition size and filesystem format.
> +
> +Per-file Read-Only Large Folio Support
> +--------------------------------------
> +
> +F2FS implements large folio support on the read path to leverage high-order
> +page allocation for significant performance gains. To minimize code complexity,
> +this support is currently excluded from the write path, which requires handling
> +complex optimizations such as compression and block allocation modes.
> +
> +This optional feature is triggered only when a file's immutable bit is set.
> +Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
> +file with write permissions, even immediately after clearing the bit. Write
> +access is only restored once the cached inode is dropped. The usage flow is
> +demonstrated below:
> +
> +.. code-block::
> +
> + # f2fs_io setflags immutable /data/testfile_read_seq
> +
> + /* flush and reload the inode to enable the large folio */
> + # sync && echo 3 > /proc/sys/vm/drop_caches
> +
> + /* mmap(MAP_POPULATE) + mlock() */
> + # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
> +
> + /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
> + # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
> +
> + /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
> + # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
> +
> + # f2fs_io clearflags immutable /data/testfile_read_seq
> +
> + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
> + Failed to open /mnt/test/test: Operation not supported
> +
> + /* flush and reload the inode to disable the large folio */
> + # sync && echo 3 > /proc/sys/vm/drop_caches
> +
> + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
> + Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
> +
> + # rm /data/testfile_read_seq
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 48c20386f031..acc4ef511bfb 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -31,9 +31,15 @@
>
> static struct kmem_cache *bio_post_read_ctx_cache;
> static struct kmem_cache *bio_entry_slab;
> +static struct kmem_cache *ffs_entry_slab;
> static mempool_t *bio_post_read_ctx_pool;
> static struct bio_set f2fs_bioset;
>
> +struct f2fs_folio_state {
> + spinlock_t state_lock;
> + unsigned int read_pages_pending;
> +};
> +
> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>
> int __init f2fs_init_bioset(void)
> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> {
> struct folio_iter fi;
> struct bio_post_read_ctx *ctx = bio->bi_private;
> + unsigned long flags;
>
> bio_for_each_folio_all(fi, bio) {
> struct folio *folio = fi.folio;
> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> + bool finished = true;
>
> - if (f2fs_is_compressed_page(folio)) {
> + if (!folio_test_large(folio) &&
> + f2fs_is_compressed_page(folio)) {
> if (ctx && !ctx->decompression_attempted)
> f2fs_end_read_compressed_page(folio, true, 0,
> in_task);
> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> bio->bi_status = BLK_STS_IOERR;
> }
>
> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> + if (folio_test_large(folio)) {
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + spin_lock_irqsave(&ffs->state_lock, flags);
> + ffs->read_pages_pending -= nr_pages;
> + finished = !ffs->read_pages_pending;
> + spin_unlock_irqrestore(&ffs->state_lock, flags);
> + }
> +
> + while (nr_pages--)
> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> +
> + if (finished)
> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> }
>
> if (ctx)
> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> enum page_type type)
> {
> + if (!bio)
> + return;
> +
> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>
> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> struct dnode_of_data dn;
> struct folio *folio;
> int err;
> -
> +retry:
> folio = f2fs_grab_cache_folio(mapping, index, for_write);
> if (IS_ERR(folio))
> return folio;
>
> + if (folio_test_large(folio)) {
> + pgoff_t folio_index = mapping_align_index(mapping, index);
> +
> + f2fs_folio_put(folio, true);
> + invalidate_inode_pages2_range(mapping, folio_index,
> + folio_index + folio_nr_pages(folio) - 1);
> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> + goto retry;
> + }
> +
> if (f2fs_lookup_read_extent_cache_block(inode, index,
> &dn.data_blkaddr)) {
> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> @@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> }
> #endif
>
> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + if (ffs)
> + return ffs;
> +
> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> +
> + spin_lock_init(&ffs->state_lock);
> + folio_attach_private(folio, ffs);
> + return ffs;
> +}
> +
> +static void ffs_detach_free(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs;
> +
> + if (!folio_test_large(folio)) {
> + folio_detach_private(folio);
> + return;
> + }
> +
> + ffs = folio_detach_private(folio);
> + if (!ffs)
> + return;
> +
> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> + kmem_cache_free(ffs_entry_slab, ffs);
> +}
> +
> +static int f2fs_read_data_large_folio(struct inode *inode,
> + struct readahead_control *rac, struct folio *folio)
> +{
> + struct bio *bio = NULL;
> + sector_t last_block_in_bio = 0;
> + struct f2fs_map_blocks map = {0, };
> + pgoff_t index, offset;
> + unsigned max_nr_pages = rac ? readahead_count(rac) :
> + folio_nr_pages(folio);
> + unsigned nrpages;
> + struct f2fs_folio_state *ffs;
> + int ret = 0;
> +
> + if (!IS_IMMUTABLE(inode))
> + return -EOPNOTSUPP;
> +
> + if (f2fs_compressed_file(inode))
> + return -EOPNOTSUPP;
> +
> + map.m_seg_type = NO_CHECK_TYPE;
> +
> + if (rac)
> + folio = readahead_folio(rac);
> +next_folio:
> + if (!folio)
> + goto out;
> +
> + index = folio->index;
> + offset = 0;
> + ffs = NULL;
> + nrpages = folio_nr_pages(folio);
> +
> + for (; nrpages; nrpages--) {
> + sector_t block_nr;
> + /*
> + * Map blocks using the previous result first.
> + */
> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> + index > map.m_lblk &&
> + index < (map.m_lblk + map.m_len))
> + goto got_it;
> +
> + /*
> + * Then do more f2fs_map_blocks() calls until we are
> + * done with this page.
> + */
> + memset(&map, 0, sizeof(map));
> + map.m_seg_type = NO_CHECK_TYPE;
> + map.m_lblk = index;
> + map.m_len = max_nr_pages;
> +
> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> + if (ret)
> + goto err_out;
> +got_it:
> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> + block_nr = map.m_pblk + index - map.m_lblk;
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> + DATA_GENERIC_ENHANCE_READ)) {
> + ret = -EFSCORRUPTED;
> + goto err_out;
> + }
> + } else {
> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> + if (f2fs_need_verity(inode, index) &&
> + !fsverity_verify_page(folio_file_page(folio,
> + index))) {
> + ret = -EIO;
> + goto err_out;
> + }
> + continue;
> + }
> +
> + /*
> + * This page will go to BIO. Do we need to send this
> + * BIO off first?
> + */
> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> + last_block_in_bio, block_nr) ||
> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> +submit_and_realloc:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + bio = NULL;
> + }
> + if (bio == NULL)
> + bio = f2fs_grab_read_bio(inode, block_nr,
> + max_nr_pages,
> + f2fs_ra_op_flags(rac),
> + index, false);
> +
> + /*
> + * If the page is under writeback, we need to wait for
> + * its completion to see the correct decrypted data.
> + */
> + f2fs_wait_on_block_writeback(inode, block_nr);
> +
> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> + offset << PAGE_SHIFT))
What do you think of adding physically contiguous folios in batches? I guess we
could reduce the number of loop iterations and repeated function calls here?
Thanks,
> + goto submit_and_realloc;
> +
> + if (folio_test_large(folio)) {
> + ffs = ffs_find_or_alloc(folio);
> +
> + /* set the bitmap to wait */
> + spin_lock_irq(&ffs->state_lock);
> + ffs->read_pages_pending++;
> + spin_unlock_irq(&ffs->state_lock);
> + }
> +
> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> + F2FS_BLKSIZE);
> + last_block_in_bio = block_nr;
> + index++;
> + offset++;
> + }
> + if (rac) {
> + folio = readahead_folio(rac);
> + goto next_folio;
> + }
> +err_out:
> + /* Nothing was submitted. */
> + if (!bio) {
> + if (!ret)
> + folio_mark_uptodate(folio);
> + folio_unlock(folio);
> + return ret;
> + }
> +
> + if (ret) {
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> +
> + /* Wait bios and clear uptodate. */
> + folio_lock(folio);
> + folio_clear_uptodate(folio);
> + folio_unlock(folio);
> + }
> +out:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + return ret;
> +}
> +
> /*
> * This function was originally taken from fs/mpage.c, and customized for f2fs.
> * Major change was from block_size == page_size in f2fs by default.
> @@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> pgoff_t index;
> #endif
> unsigned nr_pages = rac ? readahead_count(rac) : 1;
> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> unsigned max_nr_pages = nr_pages;
> int ret = 0;
>
> + if (mapping_large_folio_support(mapping))
> + return f2fs_read_data_large_folio(inode, rac, folio);
> +
> #ifdef CONFIG_F2FS_FS_COMPRESSION
> if (f2fs_compressed_file(inode)) {
> index = rac ? readahead_index(rac) : folio->index;
> @@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> }
> #endif
> }
> - if (bio)
> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> return ret;
> }
>
> @@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> f2fs_remove_dirty_inode(inode);
> }
> }
> - folio_detach_private(folio);
> +
> + if (offset || length != folio_size(folio))
> + return;
> +
> + folio_cancel_dirty(folio);
> + ffs_detach_free(folio);
> }
>
> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> @@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> if (folio_test_dirty(folio))
> return false;
>
> - folio_detach_private(folio);
> + ffs_detach_free(folio);
> return true;
> }
>
> @@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
> {
> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> sizeof(struct bio_entry));
> - return bio_entry_slab ? 0 : -ENOMEM;
> +
> + if (!bio_entry_slab)
> + return -ENOMEM;
> +
> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> + sizeof(struct f2fs_folio_state));
> +
> + if (!ffs_entry_slab) {
> + kmem_cache_destroy(bio_entry_slab);
> + return -ENOMEM;
> + }
> +
> + return 0;
> }
>
> void f2fs_destroy_bio_entry_cache(void)
> {
> kmem_cache_destroy(bio_entry_slab);
> + kmem_cache_destroy(ffs_entry_slab);
> }
>
> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 86785068554f..d7600979218e 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -4928,6 +4928,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> return false;
> }
>
> +static inline bool f2fs_quota_file(struct inode *inode)
> +{
> +#ifdef CONFIG_QUOTA
> + int i;
> +
> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> + return false;
> +
> + for (i = 0; i < MAXQUOTAS; i++) {
> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> + return true;
> + }
> +#endif
> + return false;
> +}
> +
> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> {
> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index d7047ca6b98d..e75e61ac50d7 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
> if (!f2fs_is_compress_backend_ready(inode))
> return -EOPNOTSUPP;
>
> + if (mapping_large_folio_support(inode->i_mapping) &&
> + filp->f_mode & FMODE_WRITE)
> + return -EOPNOTSUPP;
> +
> err = fsverity_file_open(inode, filp);
> if (err)
> return err;
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index e2405b79b3cc..9162154d5211 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> if (ret)
> goto bad_inode;
> make_now:
> + f2fs_set_inode_flags(inode);
> +
> if (ino == F2FS_NODE_INO(sbi)) {
> inode->i_mapping->a_ops = &f2fs_node_aops;
> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> inode->i_op = &f2fs_file_inode_operations;
> inode->i_fop = &f2fs_file_operations;
> inode->i_mapping->a_ops = &f2fs_dblock_aops;
> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> + !f2fs_quota_file(inode))
> + mapping_set_folio_min_order(inode->i_mapping, 0);
> } else if (S_ISDIR(inode->i_mode)) {
> inode->i_op = &f2fs_dir_inode_operations;
> inode->i_fop = &f2fs_dir_operations;
> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> ret = -EIO;
> goto bad_inode;
> }
> - f2fs_set_inode_flags(inode);
>
> unlock_new_inode(inode);
> trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2025-12-09 8:32 ` Chao Yu
@ 2025-12-09 18:38 ` Jaegeuk Kim
2026-01-01 11:20 ` Nanzhe Zhao
0 siblings, 1 reply; 20+ messages in thread
From: Jaegeuk Kim @ 2025-12-09 18:38 UTC (permalink / raw)
To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel
On 12/09, Chao Yu wrote:
> On 12/2/25 10:38, Jaegeuk Kim wrote:
> > This patch enables large folio for limited case where we can get the high-order
> > memory allocation. It supports the encrypted and fsverity files, which are
> > essential for Android environment.
> >
> > How to test:
> > - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> > - f2fs_io setflags immutable /mnt/test/test
> > - echo 3 > /proc/sys/vm/drop_caches
> > : to reload inode with large folio
> > - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
> >
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> > Change log from v3:
> > - enhance the doc
> >
> > Change log from v2:
> > - add a doc
> >
> > Change log from v1:
> > - return error when trying to open an inode having a large folio
> >
> > Documentation/filesystems/f2fs.rst | 43 +++++
> > fs/f2fs/data.c | 247 +++++++++++++++++++++++++++--
> > fs/f2fs/f2fs.h | 16 ++
> > fs/f2fs/file.c | 4 +
> > fs/f2fs/inode.c | 6 +-
> > 5 files changed, 306 insertions(+), 10 deletions(-)
> >
> > diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
> > index cb90d1ae82d0..9b3b835a174e 100644
> > --- a/Documentation/filesystems/f2fs.rst
> > +++ b/Documentation/filesystems/f2fs.rst
> > @@ -1033,3 +1033,46 @@ the reserved space back to F2FS for its own use.
> > So, the key idea is, user can do any file operations on /dev/vdc, and
> > reclaim the space after the use, while the space is counted as /data.
> > That doesn't require modifying partition size and filesystem format.
> > +
> > +Per-file Read-Only Large Folio Support
> > +--------------------------------------
> > +
> > +F2FS implements large folio support on the read path to leverage high-order
> > +page allocation for significant performance gains. To minimize code complexity,
> > +this support is currently excluded from the write path, which requires handling
> > +complex optimizations such as compression and block allocation modes.
> > +
> > +This optional feature is triggered only when a file's immutable bit is set.
> > +Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
> > +file with write permissions, even immediately after clearing the bit. Write
> > +access is only restored once the cached inode is dropped. The usage flow is
> > +demonstrated below:
> > +
> > +.. code-block::
> > +
> > + # f2fs_io setflags immutable /data/testfile_read_seq
> > +
> > + /* flush and reload the inode to enable the large folio */
> > + # sync && echo 3 > /proc/sys/vm/drop_caches
> > +
> > + /* mmap(MAP_POPULATE) + mlock() */
> > + # f2fs_io read 128 0 1024 mmap 1 0 /data/testfile_read_seq
> > +
> > + /* mmap() + fadvise(POSIX_FADV_WILLNEED) + mlock() */
> > + # f2fs_io read 128 0 1024 fadvise 1 0 /data/testfile_read_seq
> > +
> > + /* mmap() + mlock2(MLOCK_ONFAULT) + madvise(MADV_POPULATE_READ) */
> > + # f2fs_io read 128 0 1024 madvise 1 0 /data/testfile_read_seq
> > +
> > + # f2fs_io clearflags immutable /data/testfile_read_seq
> > +
> > + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
> > + Failed to open /mnt/test/test: Operation not supported
> > +
> > + /* flush and reload the inode to disable the large folio */
> > + # sync && echo 3 > /proc/sys/vm/drop_caches
> > +
> > + # f2fs_io write 1 0 1 zero buffered /data/testfile_read_seq
> > + Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
> > +
> > + # rm /data/testfile_read_seq
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 48c20386f031..acc4ef511bfb 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -31,9 +31,15 @@
> >
> > static struct kmem_cache *bio_post_read_ctx_cache;
> > static struct kmem_cache *bio_entry_slab;
> > +static struct kmem_cache *ffs_entry_slab;
> > static mempool_t *bio_post_read_ctx_pool;
> > static struct bio_set f2fs_bioset;
> >
> > +struct f2fs_folio_state {
> > + spinlock_t state_lock;
> > + unsigned int read_pages_pending;
> > +};
> > +
> > #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
> >
> > int __init f2fs_init_bioset(void)
> > @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> > {
> > struct folio_iter fi;
> > struct bio_post_read_ctx *ctx = bio->bi_private;
> > + unsigned long flags;
> >
> > bio_for_each_folio_all(fi, bio) {
> > struct folio *folio = fi.folio;
> > + unsigned nr_pages = fi.length >> PAGE_SHIFT;
> > + bool finished = true;
> >
> > - if (f2fs_is_compressed_page(folio)) {
> > + if (!folio_test_large(folio) &&
> > + f2fs_is_compressed_page(folio)) {
> > if (ctx && !ctx->decompression_attempted)
> > f2fs_end_read_compressed_page(folio, true, 0,
> > in_task);
> > @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
> > bio->bi_status = BLK_STS_IOERR;
> > }
> >
> > - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> > - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> > + if (folio_test_large(folio)) {
> > + struct f2fs_folio_state *ffs = folio->private;
> > +
> > + spin_lock_irqsave(&ffs->state_lock, flags);
> > + ffs->read_pages_pending -= nr_pages;
> > + finished = !ffs->read_pages_pending;
> > + spin_unlock_irqrestore(&ffs->state_lock, flags);
> > + }
> > +
> > + while (nr_pages--)
> > + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
> > +
> > + if (finished)
> > + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
> > }
> >
> > if (ctx)
> > @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
> > void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
> > enum page_type type)
> > {
> > + if (!bio)
> > + return;
> > +
> > WARN_ON_ONCE(!is_read_io(bio_op(bio)));
> > trace_f2fs_submit_read_bio(sbi->sb, type, bio);
> >
> > @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
> > struct dnode_of_data dn;
> > struct folio *folio;
> > int err;
> > -
> > +retry:
> > folio = f2fs_grab_cache_folio(mapping, index, for_write);
> > if (IS_ERR(folio))
> > return folio;
> >
> > + if (folio_test_large(folio)) {
> > + pgoff_t folio_index = mapping_align_index(mapping, index);
> > +
> > + f2fs_folio_put(folio, true);
> > + invalidate_inode_pages2_range(mapping, folio_index,
> > + folio_index + folio_nr_pages(folio) - 1);
> > + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
> > + goto retry;
> > + }
> > +
> > if (f2fs_lookup_read_extent_cache_block(inode, index,
> > &dn.data_blkaddr)) {
> > if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
> > @@ -2341,6 +2376,179 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
> > }
> > #endif
> >
> > +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
> > +{
> > + struct f2fs_folio_state *ffs = folio->private;
> > +
> > + if (ffs)
> > + return ffs;
> > +
> > + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> > +
> > + spin_lock_init(&ffs->state_lock);
> > + folio_attach_private(folio, ffs);
> > + return ffs;
> > +}
> > +
> > +static void ffs_detach_free(struct folio *folio)
> > +{
> > + struct f2fs_folio_state *ffs;
> > +
> > + if (!folio_test_large(folio)) {
> > + folio_detach_private(folio);
> > + return;
> > + }
> > +
> > + ffs = folio_detach_private(folio);
> > + if (!ffs)
> > + return;
> > +
> > + WARN_ON_ONCE(ffs->read_pages_pending != 0);
> > + kmem_cache_free(ffs_entry_slab, ffs);
> > +}
> > +
> > +static int f2fs_read_data_large_folio(struct inode *inode,
> > + struct readahead_control *rac, struct folio *folio)
> > +{
> > + struct bio *bio = NULL;
> > + sector_t last_block_in_bio = 0;
> > + struct f2fs_map_blocks map = {0, };
> > + pgoff_t index, offset;
> > + unsigned max_nr_pages = rac ? readahead_count(rac) :
> > + folio_nr_pages(folio);
> > + unsigned nrpages;
> > + struct f2fs_folio_state *ffs;
> > + int ret = 0;
> > +
> > + if (!IS_IMMUTABLE(inode))
> > + return -EOPNOTSUPP;
> > +
> > + if (f2fs_compressed_file(inode))
> > + return -EOPNOTSUPP;
> > +
> > + map.m_seg_type = NO_CHECK_TYPE;
> > +
> > + if (rac)
> > + folio = readahead_folio(rac);
> > +next_folio:
> > + if (!folio)
> > + goto out;
> > +
> > + index = folio->index;
> > + offset = 0;
> > + ffs = NULL;
> > + nrpages = folio_nr_pages(folio);
> > +
> > + for (; nrpages; nrpages--) {
> > + sector_t block_nr;
> > + /*
> > + * Map blocks using the previous result first.
> > + */
> > + if ((map.m_flags & F2FS_MAP_MAPPED) &&
> > + index > map.m_lblk &&
> > + index < (map.m_lblk + map.m_len))
> > + goto got_it;
> > +
> > + /*
> > + * Then do more f2fs_map_blocks() calls until we are
> > + * done with this page.
> > + */
> > + memset(&map, 0, sizeof(map));
> > + map.m_seg_type = NO_CHECK_TYPE;
> > + map.m_lblk = index;
> > + map.m_len = max_nr_pages;
> > +
> > + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
> > + if (ret)
> > + goto err_out;
> > +got_it:
> > + if ((map.m_flags & F2FS_MAP_MAPPED)) {
> > + block_nr = map.m_pblk + index - map.m_lblk;
> > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
> > + DATA_GENERIC_ENHANCE_READ)) {
> > + ret = -EFSCORRUPTED;
> > + goto err_out;
> > + }
> > + } else {
> > + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
> > + if (f2fs_need_verity(inode, index) &&
> > + !fsverity_verify_page(folio_file_page(folio,
> > + index))) {
> > + ret = -EIO;
> > + goto err_out;
> > + }
> > + continue;
> > + }
> > +
> > + /*
> > + * This page will go to BIO. Do we need to send this
> > + * BIO off first?
> > + */
> > + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> > + last_block_in_bio, block_nr) ||
> > + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> > +submit_and_realloc:
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + bio = NULL;
> > + }
> > + if (bio == NULL)
> > + bio = f2fs_grab_read_bio(inode, block_nr,
> > + max_nr_pages,
> > + f2fs_ra_op_flags(rac),
> > + index, false);
> > +
> > + /*
> > + * If the page is under writeback, we need to wait for
> > + * its completion to see the correct decrypted data.
> > + */
> > + f2fs_wait_on_block_writeback(inode, block_nr);
> > +
> > + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> > + offset << PAGE_SHIFT))
>
> What do you think of adding physically contiguous folios in batches? I guess we
> could reduce the number of loop iterations and repeated function calls here?
Possible as a separate patch, but not sure how much gain we can get.
>
> Thanks,
>
> > + goto submit_and_realloc;
> > +
> > + if (folio_test_large(folio)) {
> > + ffs = ffs_find_or_alloc(folio);
> > +
> > + /* set the bitmap to wait */
> > + spin_lock_irq(&ffs->state_lock);
> > + ffs->read_pages_pending++;
> > + spin_unlock_irq(&ffs->state_lock);
> > + }
> > +
> > + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
> > + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
> > + F2FS_BLKSIZE);
> > + last_block_in_bio = block_nr;
> > + index++;
> > + offset++;
> > + }
> > + if (rac) {
> > + folio = readahead_folio(rac);
> > + goto next_folio;
> > + }
> > +err_out:
> > + /* Nothing was submitted. */
> > + if (!bio) {
> > + if (!ret)
> > + folio_mark_uptodate(folio);
> > + folio_unlock(folio);
> > + return ret;
> > + }
> > +
> > + if (ret) {
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > +
> > + /* Wait bios and clear uptodate. */
> > + folio_lock(folio);
> > + folio_clear_uptodate(folio);
> > + folio_unlock(folio);
> > + }
> > +out:
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + return ret;
> > +}
> > +
> > /*
> > * This function was originally taken from fs/mpage.c, and customized for f2fs.
> > * Major change was from block_size == page_size in f2fs by default.
> > @@ -2366,9 +2574,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
> > pgoff_t index;
> > #endif
> > unsigned nr_pages = rac ? readahead_count(rac) : 1;
> > + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
> > unsigned max_nr_pages = nr_pages;
> > int ret = 0;
> >
> > + if (mapping_large_folio_support(mapping))
> > + return f2fs_read_data_large_folio(inode, rac, folio);
> > +
> > #ifdef CONFIG_F2FS_FS_COMPRESSION
> > if (f2fs_compressed_file(inode)) {
> > index = rac ? readahead_index(rac) : folio->index;
> > @@ -2459,8 +2671,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
> > }
> > #endif
> > }
> > - if (bio)
> > - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > return ret;
> > }
> >
> > @@ -3747,7 +3958,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
> > f2fs_remove_dirty_inode(inode);
> > }
> > }
> > - folio_detach_private(folio);
> > +
> > + if (offset || length != folio_size(folio))
> > + return;
> > +
> > + folio_cancel_dirty(folio);
> > + ffs_detach_free(folio);
> > }
> >
> > bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> > @@ -3756,7 +3972,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
> > if (folio_test_dirty(folio))
> > return false;
> >
> > - folio_detach_private(folio);
> > + ffs_detach_free(folio);
> > return true;
> > }
> >
> > @@ -4162,12 +4378,25 @@ int __init f2fs_init_bio_entry_cache(void)
> > {
> > bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
> > sizeof(struct bio_entry));
> > - return bio_entry_slab ? 0 : -ENOMEM;
> > +
> > + if (!bio_entry_slab)
> > + return -ENOMEM;
> > +
> > + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
> > + sizeof(struct f2fs_folio_state));
> > +
> > + if (!ffs_entry_slab) {
> > + kmem_cache_destroy(bio_entry_slab);
> > + return -ENOMEM;
> > + }
> > +
> > + return 0;
> > }
> >
> > void f2fs_destroy_bio_entry_cache(void)
> > {
> > kmem_cache_destroy(bio_entry_slab);
> > + kmem_cache_destroy(ffs_entry_slab);
> > }
> >
> > static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 86785068554f..d7600979218e 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -4928,6 +4928,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
> > return false;
> > }
> >
> > +static inline bool f2fs_quota_file(struct inode *inode)
> > +{
> > +#ifdef CONFIG_QUOTA
> > + int i;
> > +
> > + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
> > + return false;
> > +
> > + for (i = 0; i < MAXQUOTAS; i++) {
> > + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
> > + return true;
> > + }
> > +#endif
> > + return false;
> > +}
> > +
> > static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
> > {
> > return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index d7047ca6b98d..e75e61ac50d7 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -624,6 +624,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
> > if (!f2fs_is_compress_backend_ready(inode))
> > return -EOPNOTSUPP;
> >
> > + if (mapping_large_folio_support(inode->i_mapping) &&
> > + filp->f_mode & FMODE_WRITE)
> > + return -EOPNOTSUPP;
> > +
> > err = fsverity_file_open(inode, filp);
> > if (err)
> > return err;
> > diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> > index e2405b79b3cc..9162154d5211 100644
> > --- a/fs/f2fs/inode.c
> > +++ b/fs/f2fs/inode.c
> > @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > if (ret)
> > goto bad_inode;
> > make_now:
> > + f2fs_set_inode_flags(inode);
> > +
> > if (ino == F2FS_NODE_INO(sbi)) {
> > inode->i_mapping->a_ops = &f2fs_node_aops;
> > mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
> > @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > inode->i_op = &f2fs_file_inode_operations;
> > inode->i_fop = &f2fs_file_operations;
> > inode->i_mapping->a_ops = &f2fs_dblock_aops;
> > + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
> > + !f2fs_quota_file(inode))
> > + mapping_set_folio_min_order(inode->i_mapping, 0);
> > } else if (S_ISDIR(inode->i_mode)) {
> > inode->i_op = &f2fs_dir_inode_operations;
> > inode->i_fop = &f2fs_dir_operations;
> > @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
> > ret = -EIO;
> > goto bad_inode;
> > }
> > - f2fs_set_inode_flags(inode);
> >
> > unlock_new_inode(inode);
> > trace_f2fs_iget(inode);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
2025-11-20 23:54 [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Jaegeuk Kim
` (2 preceding siblings ...)
2025-11-22 1:18 ` [PATCH 1/2 v2] " Jaegeuk Kim
@ 2025-12-16 19:20 ` patchwork-bot+f2fs
3 siblings, 0 replies; 20+ messages in thread
From: patchwork-bot+f2fs @ 2025-12-16 19:20 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel
Hello:
This series was applied to jaegeuk/f2fs.git (dev)
by Jaegeuk Kim <jaegeuk@kernel.org>:
On Thu, 20 Nov 2025 23:54:45 +0000 you wrote:
> This patch enables large folio for limited case where we can get the high-order
> memory allocation. It supports the encrypted and fsverity files, which are
> essential for Android environment.
>
> How to test:
> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
> - f2fs_io setflags immutable /mnt/test/test
> - echo 3 > /proc/sys/vm/drop_caches
> : to reload inode with large folio
> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>
> [...]
Here is the summary with links:
- [f2fs-dev,1/2] f2fs: support large folio for immutable non-compressed case
(no matching commit)
- [f2fs-dev,2/2] f2fs: add a tracepoint to see large folio read submission
https://git.kernel.org/jaegeuk/f2fs/c/903c6e95bc9a
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2025-12-09 18:38 ` Jaegeuk Kim
@ 2026-01-01 11:20 ` Nanzhe Zhao
2026-01-02 6:23 ` Jaegeuk Kim
0 siblings, 1 reply; 20+ messages in thread
From: Nanzhe Zhao @ 2026-01-01 11:20 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: Chao Yu, linux-kernel, linux-f2fs-devel
Dear Kim:
Happy New Year!
> +static struct f2fs_folio_state *
> +ffs_find_or_alloc(struct folio *folio)
> +{
> + struct f2fs_folio_state *ffs = folio->private;
> +
> + if (ffs)
> + return ffs;
> +
> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> +
> + spin_lock_init(&ffs->state_lock);
> + folio_attach_private(folio, ffs);
> + return ffs;
> +}
It looks like ffs_find_or_alloc() does not initialize
read_pages_pending.
When I debug locally, printing read_pages_pending shows an undefined
random value. Also, when I run a basic read test with dd, tasks can hang
(because read_pages_pending never reaches zero, so the folio is never
unlocked and never marked uptodate).
I know this function is modeled after iomap's ifs_alloc():
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
struct folio *folio, unsigned int flags)
{
struct iomap_folio_state *ifs = folio->private;
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp;
if (ifs || nr_blocks <= 1)
return ifs;
/*...*/
/*
* ifs->state tracks two sets of state flags when the
* filesystem block size is smaller than the folio size.
* The first state tracks per-block uptodate and the
* second tracks per-block dirty state.
*/
ifs = kzalloc(struct_size(ifs, state,
BITS_TO_LONGS(2 * nr_blocks)), gfp);
if (!ifs)
return ifs;
spin_lock_init(&ifs->state_lock);
if (folio_test_uptodate(folio))
bitmap_set(ifs->state, 0, nr_blocks);
if (folio_test_dirty(folio))
bitmap_set(ifs->state, nr_blocks, nr_blocks);
folio_attach_private(folio, ifs);
return ifs;
}
Note ifs_alloc() uses kzalloc(), which zero-initializes the allocated
memory by default while f2fs_kmem_cache_alloc() does not.
We could fix this by explicitly setting read_pages_pending = 0,
or by doing a memset() right after f2fs_kmem_cache_alloc()
(the latter seems more extensible if the struct grows). What do you think?
> /*
> + * This page will go to BIO. Do we need to send this
> + * BIO off first?
> + */
> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> + last_block_in_bio, block_nr) ||
> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> +submit_and_realloc:
> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> + bio = NULL;
> + }
> + if (bio == NULL)
> + bio = f2fs_grab_read_bio(inode, block_nr,
> + max_nr_pages,
> + f2fs_ra_op_flags(rac),
> + index, false);
> +
> + /*
> + * If the page is under writeback, we need to wait for
> + * its completion to see the correct decrypted data.
> + */
> + f2fs_wait_on_block_writeback(inode, block_nr);
> +
> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> + offset << PAGE_SHIFT))
> + goto submit_and_realloc;
> +
> + if (folio_test_large(folio)) {
> + ffs = ffs_find_or_alloc(folio);
> +
> + /* set the bitmap to wait */
> + spin_lock_irq(&ffs->state_lock);
> + ffs->read_pages_pending++;
> + spin_unlock_irq(&ffs->state_lock);
> + }
In the current code, it looks like a subpage is added to the BIO (or a
cached BIO is submitted) before read_pages_pending is incremented.
This can cause the following behaviour:
After one subpage of a folio is submitted, if the I/O completes very
fast, the endio path may interrupt the read loop, run bio_endio, and
eventually call f2fs_finish_read_bio(), which decrements read_pages_pending
down to zero. That can make folio_finish_read() run too early, even
though other parts of the same folio have not been added to a BIO yet.
I managed to trigger this locally by creating a heavily fragmented file
and temporarily injecting the following code right after BIO submission:
f2fs_io_schedule_timeout(1);
WARN_ON_ONCE(!folio_test_locked(folio));
I think the correct ordering is to increment read_pages_pending first,
and then add the corresponding subpage to the BIO.
In that ordering, the BIO side will either:
1) add a subpage after the increment (matching the new pending count),
or
2) submit a BIO that corresponds to the pending increment from the
** previous iteration **,
so read_pages_pending will not reach zero prematurely.
This is exactly the order that iomap_readpage_iter() implements.
If you need the script I used to reproduce the bug, please let me know.
I will attach it in my next reply. Thanks!
Best regards,
Nanzhe Zhao
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2026-01-01 11:20 ` Nanzhe Zhao
@ 2026-01-02 6:23 ` Jaegeuk Kim
2026-01-03 10:54 ` Nanzhe Zhao
0 siblings, 1 reply; 20+ messages in thread
From: Jaegeuk Kim @ 2026-01-02 6:23 UTC (permalink / raw)
To: Nanzhe Zhao; +Cc: linux-kernel, linux-f2fs-devel
Hi Nanzhe,
On 01/01, Nanzhe Zhao wrote:
> Dear Kim:
> Happy New Year!
>
> > +static struct f2fs_folio_state *
> > +ffs_find_or_alloc(struct folio *folio)
> > +{
> > + struct f2fs_folio_state *ffs = folio->private;
> > +
> > + if (ffs)
> > + return ffs;
> > +
> > + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
> > +
> > + spin_lock_init(&ffs->state_lock);
> > + folio_attach_private(folio, ffs);
> > + return ffs;
> > +}
>
> It looks like ffs_find_or_alloc() does not initialize
> read_pages_pending.
> When I debug locally, printing read_pages_pending shows an undefined
> random value. Also, when I run a basic read test with dd, tasks can hang
> (because read_pages_pending never reaches zero, so the folio is never
> unlocked and never marked uptodate).
>
> I know this function is modeled after iomap's ifs_alloc():
>
> static struct iomap_folio_state *ifs_alloc(struct inode *inode,
> struct folio *folio, unsigned int flags)
> {
> struct iomap_folio_state *ifs = folio->private;
> unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> gfp_t gfp;
>
> if (ifs || nr_blocks <= 1)
> return ifs;
> /*...*/
> /*
> * ifs->state tracks two sets of state flags when the
> * filesystem block size is smaller than the folio size.
> * The first state tracks per-block uptodate and the
> * second tracks per-block dirty state.
> */
> ifs = kzalloc(struct_size(ifs, state,
> BITS_TO_LONGS(2 * nr_blocks)), gfp);
> if (!ifs)
> return ifs;
>
> spin_lock_init(&ifs->state_lock);
> if (folio_test_uptodate(folio))
> bitmap_set(ifs->state, 0, nr_blocks);
> if (folio_test_dirty(folio))
> bitmap_set(ifs->state, nr_blocks, nr_blocks);
> folio_attach_private(folio, ifs);
>
> return ifs;
> }
>
> Note ifs_alloc() uses kzalloc(), which zero-initializes the allocated memory
> by default while f2fs_kmem_cache_alloc() does not.
>
> We could fix this by explicitly setting read_pages_pending = 0,
> or by doing a memset() right after f2fs_kmem_cache_alloc()
> (the latter seems more extensible if the struct grows). What do you think?
Agreed. What about adding __GFP_ZERO for f2fs_kmem_cache_alloc()?
>
> > /*
> > + * This page will go to BIO. Do we need to send this
> > + * BIO off first?
> > + */
> > + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
> > + last_block_in_bio, block_nr) ||
> > + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
> > +submit_and_realloc:
> > + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
> > + bio = NULL;
> > + }
> > + if (bio == NULL)
> > + bio = f2fs_grab_read_bio(inode, block_nr,
> > + max_nr_pages,
> > + f2fs_ra_op_flags(rac),
> > + index, false);
> > +
> > + /*
> > + * If the page is under writeback, we need to wait for
> > + * its completion to see the correct decrypted data.
> > + */
> > + f2fs_wait_on_block_writeback(inode, block_nr);
> > +
> > + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
> > + offset << PAGE_SHIFT))
> > + goto submit_and_realloc;
> > +
> > + if (folio_test_large(folio)) {
> > + ffs = ffs_find_or_alloc(folio);
> > +
> > + /* set the bitmap to wait */
> > + spin_lock_irq(&ffs->state_lock);
> > + ffs->read_pages_pending++;
> > + spin_unlock_irq(&ffs->state_lock);
> > + }
>
> In the current code, it looks like a subpage is added to the BIO (or a
> cached BIO is submitted) before read_pages_pending is incremented.
> This can cause the following behaviour:
>
> After one subpage of a folio is submitted, if the I/O completes very
> fast, the endio path may interrupt the read loop, run bio_endio, and
> eventually call f2fs_finish_read_bio(), which decrements read_pages_pending
> down to zero. That can make folio_finish_read() run too early, even though
> other parts of the same folio have not been added to a BIO yet.
>
> I managed to trigger this locally by creating a heavily fragmented file
> and temporarily injecting the following code right after BIO submission:
>
> f2fs_io_schedule_timeout(1);
> WARN_ON_ONCE(!folio_test_locked(folio));
>
> I think the correct ordering is to increment read_pages_pending first,
> and then add the corresponding subpage to the BIO.
> In that ordering, the BIO side will either:
> 1) add a subpage after the increment (matching the new pending count),
> or
> 2) submit a BIO that corresponds to the pending increment from the
> ** previous iteration **,
> so read_pages_pending will not reach zero prematurely.
> This is exactly the order that iomap_readpage_iter() implements.
>
> If you need the script I used to reproduce the bug, please let me know.
> I will attach it in my next reply. Thanks!
I think this is also valid. If possible, could you please post patches to
fix these two bugs?
Thanks,
>
> Best regards,
> Nanzhe Zhao
>
>
>
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re:Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2026-01-02 6:23 ` Jaegeuk Kim
@ 2026-01-03 10:54 ` Nanzhe Zhao
2026-01-04 3:20 ` Jaegeuk Kim
0 siblings, 1 reply; 20+ messages in thread
From: Nanzhe Zhao @ 2026-01-03 10:54 UTC (permalink / raw)
To: Jaegeuk Kim; +Cc: Chao Yu, linux-kernel, linux-f2fs-devel
Dear Kim:
Thanks for your quick reply!
I applied the two bug fixes on my local branch and found that
I still couldn't pass my test of generating and reading a heavily
fragmented file.
The root cause is that the current code treats hole blocks as mapped
blocks as well and mistakenly increments read_pages_pending, resulting
in a task hang in readahead.
Inside f2fs_map_blocks():
/* DIO READ and hole case, should not map the blocks. */
if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create))
map->m_flags |= F2FS_MAP_MAPPED;
it will have map->m_flags marked with F2FS_MAP_MAPPED in non-DIO and
no blocks creation context for NULL_ADDR and NEW_ADDR, except for
holes mapped to an unallocated dnode.
Personally, I think a better fix is to add a helper function
f2fs_block_needs_zeroing(). The condition could be: return true if the
current blkaddr is NULL_ADDR or NEW_ADDR.
Then we can reverse the order of the checks under the got_it: label:
first `if (f2fs_block_needs_zeroing()) ...`, and then `else if
(map->m_flags & F2FS_MAP_MAPPED)`, while keeping all the logic inside
those statements unchanged.
For the parameters of f2fs_block_needs_zeroing(), I think we can pass
`struct f2fs_map_blocks` directly, because it already contains all the
information we need. Also, if we later want to support batching
contiguous physical block mappings and bio additions inside the loop,
this signature should be more extensible.
If you think this approach makes sense, I can send a patch to fix all
three bugs. Thank you.
Best regards,
Nanzhe Zhao
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [f2fs-dev] [PATCH 1/2 v4] f2fs: support large folio for immutable non-compressed case
2026-01-03 10:54 ` Nanzhe Zhao
@ 2026-01-04 3:20 ` Jaegeuk Kim
0 siblings, 0 replies; 20+ messages in thread
From: Jaegeuk Kim @ 2026-01-04 3:20 UTC (permalink / raw)
To: Nanzhe Zhao; +Cc: Chao Yu, linux-kernel, linux-f2fs-devel
On 01/03, Nanzhe Zhao wrote:
> Dear Kim:
> Thanks for your quick reply!
>
> I applied the two bug fixes on my local branch and found that
> I still couldn't pass my test of generating and reading a heavily
> fragmented file.
>
> The root cause is that the current code treats hole blocks as mapped
> blocks as well and mistakenly increments read_pages_pending, resulting
> in a task hang in readahead.
>
> Inside f2fs_map_blocks():
>
> /* DIO READ and hole case, should not map the blocks. */
> if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create))
> map->m_flags |= F2FS_MAP_MAPPED;
>
> it will have map->m_flags marked with F2FS_MAP_MAPPED in non-DIO and
> no blocks creation context for NULL_ADDR and NEW_ADDR, except for
> holes mapped to an unallocated dnode.
>
> Personally, I think a better fix is to add a helper function
> f2fs_block_needs_zeroing(). The condition could be: return true if the
> current blkaddr is NULL_ADDR or NEW_ADDR.
>
> Then we can reverse the order of the checks under the got_it: label:
> first `if (f2fs_block_needs_zeroing()) ...`, and then `else if
> (map->m_flags & F2FS_MAP_MAPPED)`, while keeping all the logic inside
> those statements unchanged.
>
> For the parameters of f2fs_block_needs_zeroing(), I think we can pass
> `struct f2fs_map_blocks` directly, because it already contains all the
> information we need. Also, if we later want to support batching
> contiguous physical block mappings and bio additions inside the loop,
> this signature should be more extensible.
>
> If you think this approach makes sense, I can send a patch to fix all
> three bugs. Thank you.
I think that's feasible. Could you please post a patch to discuss further?
Thanks,
>
> Best regards,
> Nanzhe Zhao
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2026-01-04 3:20 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-20 23:54 [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Jaegeuk Kim
2025-11-20 23:54 ` [PATCH 2/2] f2fs: add a tracepoint to see large folio read submission Jaegeuk Kim
2025-11-21 10:23 ` [f2fs-dev] " Chao Yu
2025-11-21 10:20 ` [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case Chao Yu
2025-11-22 1:17 ` Jaegeuk Kim
2025-11-25 1:38 ` Chao Yu
2025-12-01 19:31 ` [f2fs-dev] [PATCH 1/2 v2] " Jaegeuk Kim
2025-12-01 21:37 ` Chao Yu
2025-12-01 22:30 ` [f2fs-dev] [PATCH 1/2 v3] " Jaegeuk Kim
2025-12-01 22:37 ` Chao Yu
2025-12-02 2:38 ` [f2fs-dev] [PATCH 1/2 v4] " Jaegeuk Kim
2025-12-02 18:07 ` Chao Yu
2025-12-09 8:32 ` Chao Yu
2025-12-09 18:38 ` Jaegeuk Kim
2026-01-01 11:20 ` Nanzhe Zhao
2026-01-02 6:23 ` Jaegeuk Kim
2026-01-03 10:54 ` Nanzhe Zhao
2026-01-04 3:20 ` Jaegeuk Kim
2025-11-22 1:18 ` [PATCH 1/2 v2] " Jaegeuk Kim
2025-12-16 19:20 ` [f2fs-dev] [PATCH 1/2] " patchwork-bot+f2fs
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox