public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jaegeuk Kim <jaegeuk@kernel.org>
To: linux-kernel@vger.kernel.org, linux-f2fs-devel@lists.sourceforge.net
Cc: Akilesh Kailash <akailash@google.com>
Subject: Re: [PATCH v2] f2fs: another way to set large folio by remembering inode number
Date: Fri, 10 Apr 2026 01:16:23 +0000	[thread overview]
Message-ID: <adhPZxtbZxgU-37v@google.com> (raw)
In-Reply-To: <20260409134538.3692605-1-jaegeuk@kernel.org>

enum {
       F2FS_XATTR_FADV_LARGEFOLIO,
};

unsigned int value = BIT(F2FS_XATTR_FADV_LARGEFOLIO);

1. setxattr(file, "user.fadvise", &value, sizeof(unsigned int), 0)
 -> register the inode number for large folio
2. chmod(file, 0400)
 -> make Read-Only
3. fsync() && close() && open(READ)
 -> f2fs_iget() with large folio
4. open(WRITE), mkwrite on mmap, chmod(WRITE)
 -> return error
5. close() and open()
 -> goto #3
6. unlink
 -> deregister the inode number

Suggested-by: Akilesh Kailash <akailash@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 
  Log from v1:
   - add a condition in f2fs_drop_inode
   - add Doc

 Documentation/filesystems/f2fs.rst | 41 ++++++++++++++++++++++++++----
 fs/f2fs/checkpoint.c               |  2 +-
 fs/f2fs/data.c                     |  2 +-
 fs/f2fs/f2fs.h                     |  1 +
 fs/f2fs/file.c                     | 11 ++++++--
 fs/f2fs/inode.c                    | 19 +++++++++++---
 fs/f2fs/super.c                    |  7 +++++
 fs/f2fs/xattr.c                    | 35 ++++++++++++++++++++++++-
 fs/f2fs/xattr.h                    |  6 +++++
 9 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 7e4031631286..de899d0d3088 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -1044,11 +1044,14 @@ page allocation for significant performance gains. To minimize code complexity,
 this support is currently excluded from the write path, which requires handling
 complex optimizations such as compression and block allocation modes.
 
-This optional feature is triggered only when a file's immutable bit is set.
-Consequently, F2FS will return EOPNOTSUPP if a user attempts to open a cached
-file with write permissions, even immediately after clearing the bit. Write
-access is only restored once the cached inode is dropped. The usage flow is
-demonstrated below:
+This optional feature is triggered by two mechanisms: the file's immutable bit
+or a specific xattr flag. In both cases, F2FS ensures data integrity by
+restricting the file to a read-only state while large folios are active.
+
+1. Immutable Bit Approach:
+Triggered when the FS_IMMUTABLE_FL is set. This is a strict enforcement
+where the file cannot be modified at all until the bit is cleared and
+the cached inode is dropped.
 
 .. code-block::
 
@@ -1078,3 +1081,31 @@ demonstrated below:
    Written 4096 bytes with pattern = zero, total_time = 29 us, max_latency = 28 us
 
    # rm /data/testfile_read_seq
+
+2. XATTR fadvise Approach:
+A more flexible registration via extended attributes.
+
+.. code-block::
+
+    enum {
+        F2FS_XATTR_FADV_LARGEFOLIO,
+    };
+    unsigned int value = BIT(F2FS_XATTR_FADV_LARGEFOLIO);
+
+    /* Registers the inode number for large folio support in the subsystem.*/
+    # setxattr(file, "user.fadvise", &value, sizeof(unsigned int), 0)
+
+    /* The file must be made Read-Only to transition into the large folio path. */
+    # fchmod(fd, 0400)
+
+    /* clean up dirty inode state. */
+    # fsync(fd)
+
+    /* Drop the inode cache. */
+    # close(fd)
+
+    /* f2fs_iget() instantiates the inode with large folio support.*/
+    # open()
+
+    /* Returns -EOPNOTSUPP or error to protect the large folio cache.*/
+    # open(WRITE), mkwrite on mmap, or chmod(WRITE)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 01e1ba77263e..fdd62ddc3ed6 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -778,7 +778,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 	__remove_ino_entry(sbi, ino, type);
 }
 
-/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
+/* mode should be APPEND_INO, UPDATE_INO, LARGE_FOLIO_INO, or TRANS_DIR_INO */
 bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
 {
 	struct inode_management *im = &sbi->im[mode];
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 965d4e6443c6..5e46230398d7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2494,7 +2494,7 @@ static int f2fs_read_data_large_folio(struct inode *inode,
 	int ret = 0;
 	bool folio_in_bio;
 
-	if (!IS_IMMUTABLE(inode) || f2fs_compressed_file(inode)) {
+	if (f2fs_compressed_file(inode)) {
 		if (folio)
 			folio_unlock(folio);
 		return -EOPNOTSUPP;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e40b6b2784ee..02bc6eb96a59 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -381,6 +381,7 @@ enum {
 /* for the list of ino */
 enum {
 	ORPHAN_INO,		/* for orphan ino list */
+	LARGE_FOLIO_INO,	/* for large folio case */
 	APPEND_INO,		/* for append ino list */
 	UPDATE_INO,		/* for update ino list */
 	TRANS_DIR_INO,		/* for transactions dir ino list */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c0220cd7b332..64ba900410fc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2068,9 +2068,16 @@ static long f2fs_fallocate(struct file *file, int mode,
 
 static int f2fs_release_file(struct inode *inode, struct file *filp)
 {
-	if (atomic_dec_and_test(&F2FS_I(inode)->open_count))
+	if (atomic_dec_and_test(&F2FS_I(inode)->open_count)) {
 		f2fs_remove_donate_inode(inode);
-
+		/*
+		 * In order to get large folio as soon as possible, let's drop
+		 * inode cache asap. See also f2fs_drop_inode.
+		 */
+		if (f2fs_exist_written_data(F2FS_I_SB(inode),
+					    inode->i_ino, LARGE_FOLIO_INO))
+                       d_drop(filp->f_path.dentry);
+	}
 	/*
 	 * f2fs_release_file is called at every close calls. So we should
 	 * not drop any inmemory pages by close called by other process.
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 89240be8cc59..e100bc5a378c 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -565,6 +565,20 @@ static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino)
 		ino == F2FS_COMPRESS_INO(sbi);
 }
 
+/* Opt in to large folios: immutable, or registered with no write bits set. */
+static void f2fs_mapping_set_large_folio(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (f2fs_compressed_file(inode) || f2fs_quota_file(sbi, inode->i_ino))
+		return;	/* compressed and quota files are excluded */
+	if (!IS_IMMUTABLE(inode) &&
+	    (!f2fs_exist_written_data(sbi, inode->i_ino, LARGE_FOLIO_INO) ||
+	     (inode->i_mode & S_IWUGO)))
+		return;	/* neither immutable nor registered and read-only */
+	mapping_set_folio_min_order(inode->i_mapping, 0);
+}
+
 struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -620,9 +634,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &f2fs_file_inode_operations;
 		inode->i_fop = &f2fs_file_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
-		if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
-		    !f2fs_quota_file(sbi, inode->i_ino))
-			mapping_set_folio_min_order(inode->i_mapping, 0);
+		f2fs_mapping_set_large_folio(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &f2fs_dir_inode_operations;
 		inode->i_fop = &f2fs_dir_operations;
@@ -895,6 +907,7 @@ void f2fs_evict_inode(struct inode *inode)
 	f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
 	f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
 	f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
+	f2fs_remove_ino_entry(sbi, inode->i_ino, LARGE_FOLIO_INO);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) {
 		sb_start_intwrite(inode->i_sb);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ccf806b676f5..11d1e0c99ac1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1844,6 +1844,13 @@ static int f2fs_drop_inode(struct inode *inode)
 			return 1;
 		}
 	}
+	/*
+	 * In order to get large folio as soon as possible, let's drop
+	 * inode cache asap. See also f2fs_release_file.
+	 */
+	if (f2fs_exist_written_data(sbi, inode->i_ino, LARGE_FOLIO_INO) &&
+	    !is_inode_flag_set(inode, FI_DIRTY_INODE))
+		return 1;
 
 	/*
 	 * This is to avoid a deadlock condition like below.
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 941dc62a6d6f..0c0e44c2dcdd 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -44,6 +44,16 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
 		kfree(xattr_addr);
 }
 
+/* Report user.fadvise for @inode; a NULL @buffer only asks for the size. */
+static int f2fs_xattr_fadvise_get(struct inode *inode, void *buffer)
+{
+	bool large = mapping_large_folio_support(inode->i_mapping);
+
+	if (buffer)	/* assign, don't OR: @buffer may hold stale bytes */
+		*(unsigned int *)buffer = large ? BIT(F2FS_XATTR_FADV_LARGEFOLIO) : 0;
+	return sizeof(unsigned int);
+}
+
 static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 		struct dentry *unused, struct inode *inode,
 		const char *name, void *buffer, size_t size)
@@ -61,10 +71,29 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
+	if (handler->flags == F2FS_XATTR_INDEX_USER &&
+	    !strcmp(name, "fadvise"))
+		return f2fs_xattr_fadvise_get(inode, buffer);
+
 	return f2fs_getxattr(inode, handler->flags, name,
 			     buffer, size, NULL);
 }
 
+/* Register or deregister @inode for large folios from a user.fadvise value. */
+static int f2fs_xattr_fadvise_set(struct inode *inode, const void *value)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	/* @value is NULL when the xattr is removed: treat that as "clear". */
+	unsigned int new_fadvise = value ? *(const unsigned int *)value : 0;
+
+	/* NOTE(review): size is still unchecked; callers should validate it. */
+	if (new_fadvise & BIT(F2FS_XATTR_FADV_LARGEFOLIO))
+		f2fs_add_ino_entry(sbi, inode->i_ino, LARGE_FOLIO_INO);
+	else
+		f2fs_remove_ino_entry(sbi, inode->i_ino, LARGE_FOLIO_INO);
+	return 0;
+}
+
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
 		struct mnt_idmap *idmap,
 		struct dentry *unused, struct inode *inode,
@@ -84,6 +113,10 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
+	if (handler->flags == F2FS_XATTR_INDEX_USER &&
+	    !strcmp(name, "fadvise"))
+		return f2fs_xattr_fadvise_set(inode, value);
+
 	return f2fs_setxattr(inode, handler->flags, name,
 					value, size, NULL, flags);
 }
@@ -842,4 +875,4 @@ int __init f2fs_init_xattr_cache(void)
 void f2fs_destroy_xattr_cache(void)
 {
 	kmem_cache_destroy(inline_xattr_slab);
-}
\ No newline at end of file
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index bce3d93e4755..455f460d014e 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -24,6 +24,7 @@
 #define F2FS_XATTR_REFCOUNT_MAX         1024
 
 /* Name indexes */
+#define F2FS_USER_FADVISE_NAME			"user.fadvise"
 #define F2FS_SYSTEM_ADVISE_NAME			"system.advise"
 #define F2FS_XATTR_INDEX_USER			1
 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS	2
@@ -39,6 +40,11 @@
 #define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT	"c"
 #define F2FS_XATTR_NAME_VERITY			"v"
 
+/* used for F2FS_USER_FADVISE_NAME */
+enum {
+	F2FS_XATTR_FADV_LARGEFOLIO,
+};
+
 struct f2fs_xattr_header {
 	__le32  h_magic;        /* magic number for identification */
 	__le32  h_refcount;     /* reference count */
-- 
2.53.0.1213.gd9a14994de-goog


      reply	other threads:[~2026-04-10  1:16 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-09 13:45 [PATCH] f2fs: another way to set large folio by remembering inode number Jaegeuk Kim
2026-04-10  1:16 ` Jaegeuk Kim [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=adhPZxtbZxgU-37v@google.com \
    --to=jaegeuk@kernel.org \
    --cc=akailash@google.com \
    --cc=linux-f2fs-devel@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox