Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH] ext4: add ext4_dir_entry_is_tail()
From: Artem Blagodarenko @ 2026-05-26 23:38 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko

From: Artem Blagodarenko <artem.blagodarenko@gmail.com>

Replace open-coded checks for directory tail entries with a call
to ext4_dir_entry_is_tail(). This helper will also be used by
upcoming changes.

Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
 fs/ext4/ext4.h  | 16 ++++++++++++++++
 fs/ext4/namei.c |  7 +------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..01b1222b1454 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3917,6 +3917,22 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 		io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
 }
 
+/*
+ * ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
+ * @de: directory entry to check
+ *
+ * Returns true if @de is a directory block tail entry (checksum record).
+ */
+static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
+{
+	struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de;
+
+	return !t->det_reserved_zero1 &&
+	       le16_to_cpu(t->det_rec_len) == sizeof(*t) &&
+	       !t->det_reserved_zero2 &&
+	       t->det_reserved_ft == EXT4_FT_DIR_CSUM;
+}
+
 extern const struct iomap_ops ext4_iomap_ops;
 extern const struct iomap_ops ext4_iomap_report_ops;
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..accf63fbbc79 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -314,7 +314,6 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 						   struct buffer_head *bh)
 {
 	struct ext4_dir_entry_tail *t;
-	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
 
 #ifdef PARANOID
 	struct ext4_dir_entry *d, *top;
@@ -334,11 +333,7 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 	t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
 #endif
 
-	if (t->det_reserved_zero1 ||
-	    (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
-	     sizeof(struct ext4_dir_entry_tail)) ||
-	    t->det_reserved_zero2 ||
-	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+	if (!ext4_dir_entry_is_tail((struct ext4_dir_entry_2 *)t))
 		return NULL;
 
 	return t;
-- 
2.43.7


^ permalink raw reply related

* [PATCH] ext4: replace ext4_dir_entry with ext4_dir_entry_2
From: Artem Blagodarenko @ 2026-05-26 23:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko

From: Artem Blagodarenko <artem.blagodarenko@gmail.com>

Replace remaining uses of struct ext4_dir_entry in namei.c
with struct ext4_dir_entry_2.

The code paths affected by this change already depend on the
filetype feature, so using struct ext4_dir_entry_2 is
appropriate and avoids mixing the two directory entry types
unnecessarily.

This change does not affect support for 16-bit rec_len.

Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
 fs/ext4/namei.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..a316fc2ac41b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -102,7 +102,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
 }
 
 static int ext4_dx_csum_verify(struct inode *inode,
-			       struct ext4_dir_entry *dirent);
+			       struct ext4_dir_entry_2 *dirent);
 
 /*
  * Hints to ext4_read_dirblock regarding whether we expect a directory
@@ -128,7 +128,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 						unsigned int line)
 {
 	struct buffer_head *bh;
-	struct ext4_dir_entry *dirent;
+	struct ext4_dir_entry_2 *dirent;
 	int is_dx_block = 0;
 
 	if (block >= inode->i_size >> inode->i_blkbits) {
@@ -160,7 +160,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	}
 	if (!bh)
 		return NULL;
-	dirent = (struct ext4_dir_entry *) bh->b_data;
+	dirent = (struct ext4_dir_entry_2 *) bh->b_data;
 	/* Determine whether or not we have an index block */
 	if (is_dx(inode)) {
 		if (block == 0)
@@ -317,13 +317,13 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
 
 #ifdef PARANOID
-	struct ext4_dir_entry *d, *top;
+	struct ext4_dir_entry_2 *d, *top;
 
-	d = (struct ext4_dir_entry *)bh->b_data;
-	top = (struct ext4_dir_entry *)(bh->b_data +
+	d = (struct ext4_dir_entry_2 *)bh->b_data;
+	top = (struct ext4_dir_entry_2 *)(bh->b_data +
 		(blocksize - sizeof(struct ext4_dir_entry_tail)));
 	while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
-		d = (struct ext4_dir_entry *)(((void *)d) +
+		d = (struct ext4_dir_entry_2 *)(((void *)d) +
 		    ext4_rec_len_from_disk(d->rec_len, blocksize));
 
 	if (d != top)
@@ -410,10 +410,10 @@ int ext4_handle_dirty_dirblock(handle_t *handle,
 }
 
 static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
-					       struct ext4_dir_entry *dirent,
+					       struct ext4_dir_entry_2 *dirent,
 					       int *offset)
 {
-	struct ext4_dir_entry *dp;
+	struct ext4_dir_entry_2 *de;
 	struct dx_root_info *root;
 	int count_offset;
 	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
@@ -422,10 +422,10 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
 	if (rlen == blocksize)
 		count_offset = 8;
 	else if (rlen == 12) {
-		dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
-		if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
+		de = (struct ext4_dir_entry_2 *)(((void *)dirent) + 12);
+		if (ext4_rec_len_from_disk(de->rec_len, blocksize) != blocksize - 12)
 			return NULL;
-		root = (struct dx_root_info *)(((void *)dp + 12));
+		root = (struct dx_root_info *)(((void *)de + 12));
 		if (root->reserved_zero ||
 		    root->info_length != sizeof(struct dx_root_info))
 			return NULL;
@@ -438,7 +438,7 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
 	return (struct dx_countlimit *)(((void *)dirent) + count_offset);
 }
 
-static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry_2 *dirent,
 			   int count_offset, int count, struct dx_tail *t)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -456,7 +456,7 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
 }
 
 static int ext4_dx_csum_verify(struct inode *inode,
-			       struct ext4_dir_entry *dirent)
+			       struct ext4_dir_entry_2 *dirent)
 {
 	struct dx_countlimit *c;
 	struct dx_tail *t;
@@ -485,7 +485,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
 	return 1;
 }
 
-static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *dirent)
 {
 	struct dx_countlimit *c;
 	struct dx_tail *t;
@@ -515,7 +515,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
 					    struct inode *inode,
 					    struct buffer_head *bh)
 {
-	ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+	ext4_dx_csum_set(inode, (struct ext4_dir_entry_2 *)bh->b_data);
 	return ext4_handle_dirty_metadata(handle, inode, bh);
 }
 
@@ -1488,7 +1488,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
 }
 
 static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
-			       struct ext4_dir_entry *de)
+			       struct ext4_dir_entry_2 *de)
 {
 	struct super_block *sb = dir->i_sb;
 
@@ -1619,7 +1619,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
 		}
 		if (!buffer_verified(bh) &&
 		    !is_dx_internal_node(dir, block,
-					 (struct ext4_dir_entry *)bh->b_data) &&
+					 (struct ext4_dir_entry_2 *)bh->b_data) &&
 		    !ext4_dirblock_csum_verify(dir, bh)) {
 			EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
 					     "checksumming directory "
-- 
2.43.7


^ permalink raw reply related

* [PATCH] ext4: Remove mention of PageWriteback
From: Matthew Wilcox (Oracle) @ 2026-05-26 19:08 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Matthew Wilcox (Oracle), Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel

Update a comment to refer to the concept of writeback instead of the
(now obsolete) detail of how it's implemented.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/ext4/page-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dc82e7b57e75..bc674aa4a656 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
  * written. On IO failure, check if journal abort is needed. Note that
  * we are protected from truncate touching same part of extent tree by the
  * fact that truncate code waits for all DIO to finish (thus exclusion from
- * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * direct IO is achieved) and also waits for writeback to complete. Thus we
  * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
  * completed (happens from ext4_free_ioend()).
  */
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH v4 04/23] ext4: add iomap address space operations for buffered I/O
From: Ojaswin Mujoo @ 2026-05-26 17:11 UTC (permalink / raw)
  To: Zhang Yi
  Cc: Zhang Yi, linux-ext4, linux-fsdevel, linux-kernel, tytso,
	adilger.kernel, libaokun, jack, ritesh.list, djwong, hch,
	yi.zhang, yangerkun, yukuai
In-Reply-To: <51b3f8d5-b90f-49fe-b93e-171268db9ff8@gmail.com>

On Wed, May 20, 2026 at 10:49:50AM +0800, Zhang Yi wrote:
> On 5/20/2026 12:53 AM, Ojaswin Mujoo wrote:
> > On Tue, May 19, 2026 at 08:35:51PM +0800, Zhang Yi wrote:
> > > On 5/19/2026 5:28 PM, Ojaswin Mujoo wrote:
> > > > On Mon, May 11, 2026 at 03:23:24PM +0800, Zhang Yi wrote:
> > > > > From: Zhang Yi <yi.zhang@huawei.com>
> > > > > 
> > > > > Introduce initial support for iomap in the buffered I/O path for regular
> > > > > files on ext4.
> > > > > 
> > > > >    - Add a new inode state flag EXT4_STATE_BUFFERED_IOMAP to indicate the
> > > > >      inode uses iomap instead of buffer_head for buffered I/O
> > > > >    - Add helper ext4_inode_buffered_iomap() to check the flag
> > > > >    - Add new address space operations ext4_iomap_aops with callbacks that
> > > > >      will use generic iomap implementations
> > > > >    - Add ext4_iomap_aops to ext4_set_aops() when the flag is set
> > > > > 
> > > > > The following callbacks(read_folio(), readahead(), writepages()) are
> > > > > provided as placeholders and will be implemented in later patches.
> > > > > 
> > > > > Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> > > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > 
> > > > > Hi Zhang, looks good to me. Just a question below:
> > > 
> > > Hi, Ojaswin! Thank you for the review of this series.
> > > 
> > > > > ---
> > > > >   fs/ext4/ext4.h  |  7 +++++++
> > > > >   fs/ext4/inode.c | 32 ++++++++++++++++++++++++++++++++
> > > > >   2 files changed, 39 insertions(+)
> > > > > 
> > > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > > > index 94283a991e5c..1e27d73d7427 100644
> > > > > --- a/fs/ext4/ext4.h
> > > > > +++ b/fs/ext4/ext4.h
> > > > > @@ -1972,6 +1972,7 @@ enum {
> > > > >   	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
> > > > >   	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
> > > > >   	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
> > > > > +	EXT4_STATE_BUFFERED_IOMAP,	/* Inode use iomap for buffered IO */
> > > > >   };
> > > > >   #define EXT4_INODE_BIT_FNS(name, field, offset)				\
> > > > > @@ -2040,6 +2041,12 @@ static inline bool ext4_inode_orphan_tracked(struct inode *inode)
> > > > >   		!list_empty(&EXT4_I(inode)->i_orphan);
> > > > >   }
> > > > > +/* Whether the inode pass through the iomap infrastructure for buffered I/O */
> > > > > +static inline bool ext4_inode_buffered_iomap(struct inode *inode)
> > > > > +{
> > > > > +	return ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
> > > > > +}
> > > > > +
> > > > >   /*
> > > > >    * Codes for operating systems
> > > > >    */
> > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > > index b1ef706987c3..178ac2be37b7 100644
> > > > > --- a/fs/ext4/inode.c
> > > > > +++ b/fs/ext4/inode.c
> > > > > @@ -3908,6 +3908,22 @@ const struct iomap_ops ext4_iomap_report_ops = {
> > > > >   	.iomap_begin = ext4_iomap_begin_report,
> > > > >   };
> > > > > +static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
> > > > > +{
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static void ext4_iomap_readahead(struct readahead_control *rac)
> > > > > +{
> > > > > +
> > > > > +}
> > > > > +
> > > > > +static int ext4_iomap_writepages(struct address_space *mapping,
> > > > > +				 struct writeback_control *wbc)
> > > > > +{
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > >   /*
> > > > >    * For data=journal mode, folio should be marked dirty only when it was
> > > > >    * writeably mapped. When that happens, it was already attached to the
> > > > > @@ -3994,6 +4010,20 @@ static const struct address_space_operations ext4_da_aops = {
> > > > >   	.swap_activate		= ext4_iomap_swap_activate,
> > > > >   };
> > > > > +static const struct address_space_operations ext4_iomap_aops = {
> > > > > +	.read_folio		= ext4_iomap_read_folio,
> > > > > +	.readahead		= ext4_iomap_readahead,
> > > > > +	.writepages		= ext4_iomap_writepages,
> > > > > +	.dirty_folio		= iomap_dirty_folio,
> > > > > +	.bmap			= ext4_bmap,
> > > > > +	.invalidate_folio	= iomap_invalidate_folio,
> > > > > +	.release_folio		= iomap_release_folio,
> > > > > +	.migrate_folio		= filemap_migrate_folio,
> > > > > +	.is_partially_uptodate  = iomap_is_partially_uptodate,
> > > > > +	.error_remove_folio	= generic_error_remove_folio,
> > > > > +	.swap_activate		= ext4_iomap_swap_activate,
> > > > > +};
> > > > 
> > > > So one question, for ->release_folio() we are using
> > > > > iomap_release_folio() instead of ext4_release_folio() here which doesn't
> > > > > make the jbd2_journal_try_to_free_buffers() call. IIUC this function
> > > > seems to be trying to clean up already checkpointed buffers.
> > > > 
> > > > I wanted to check if ->release_folio() can be called for folios with
> > > > ext4 metadata buffers? (from my limited understanding of
> > > > shrink_folio_list() -> filemap_release_folio() it seems we can) And if
> > > > it can be called, is it okay to skip the
> > > > jbd2_journal_try_to_free_buffers call?
> > > 
> > > Here, in ->release_folio(), folio->mapping points to inode->i_data (the
> > > file's pagecache), not the block device's pagecache. ext4 metadata
> > > resides in the block device's pagecache, which is at a different layer
> > > than this release_folio callback. So we don't need to call
> > > jbd2_journal_try_to_free_buffers() in the iomap path here.
> > 
> > Hi Yi,
> > 
> > > Thanks for clarifying and yes, that's what I was missing! So this
> > ->release_folio() is only for data folios. So I guess the
> > jbd2_journal_try_to_free_buffers() is mostly to handle data=journal
> > case?
> 
> Yes, that's my understanding as well. Meanwhile, the comment for the
> jbd2_journal_try_to_free_buffers() function looks quite outdated and
> needs to be updated.

Looks good, thanks for the explanation and for fixing it.

Regards,
ojaswin

> 
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..239bcf88ed1c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct
> journal_head *jh)
>  }
> 
>  /**
> - * jbd2_journal_try_to_free_buffers() - try to free page buffers.
> + * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
>   * @journal: journal for operation
>   * @folio: Folio to detach data from.
>   *
> - * For all the buffers on this page,
> - * if they are fully written out ordered data, move them onto BUF_CLEAN
> - * so try_to_free_buffers() can reap them.
> + * For each buffer_head on @folio, if the buffer has a journal_head but
> + * is not attached to a running or committing transaction, try to remove
> + * it from the checkpoint list.  This is needed for data=journal mode
> + * where data buffers are journaled: once they are checkpointed, the
> + * journal_head can be detached and the buffer freed.  If any buffer is
> + * still attached to a transaction, the folio cannot be released and we
> + * bail out.  Otherwise we call try_to_free_buffers() to detach all
> + * buffer_heads from the folio.
>   *
> - * This function returns non-zero if we wish try_to_free_buffers()
> - * to be called. We do this if the page is releasable by
> try_to_free_buffers().
> - * We also do it if the page has locked or dirty buffers and the caller
> wants
> - * us to perform sync or async writeout.
> + * For data=ordered and writeback modes, data buffers never have
> + * journal_heads, so this degenerates to a plain try_to_free_buffers().
>   *
> - * This complicates JBD locking somewhat.  We aren't protected by the
> - * BKL here.  We wish to remove the buffer from its committing or
> - * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
> - *
> - * This may *change* the value of transaction_t->t_datalist, so anyone
> - * who looks at t_datalist needs to lock against this function.
> - *
> - * Even worse, someone may be doing a jbd2_journal_dirty_data on this
> - * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
> - * will come out of the lock with the buffer dirty, which makes it
> - * ineligible for release here.
> - *
> - * Who else is affected by this?  hmm...  Really the only contender
> - * is do_get_write_access() - it could be looking at the buffer while
> - * journal_try_to_free_buffer() is changing its state.  But that
> - * cannot happen because we never reallocate freed data as metadata
> - * while the data is part of a transaction.  Yes?
> - *
> - * Return false on failure, true on success
> + * Return: true if the folio's buffers were freed, false otherwise
>   */
>  bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio
> *folio)
>  {
> 
> Thanks,
> Yi.
> 
> 
> 

^ permalink raw reply

* Re: [PATCH v4 08/23] ext4: implement buffered write path using iomap
From: Ojaswin Mujoo @ 2026-05-26 17:10 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-9-yi.zhang@huaweicloud.com>

On Mon, May 11, 2026 at 03:23:28PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Introduce two new iomap_ops instances for ext4 buffered writes:
> 
>  - ext4_iomap_buffered_da_write_ops: for delayed allocation mode, using
>    ext4_da_map_blocks() to map delalloc extents.
>  - ext4_iomap_buffered_write_ops: for non-delayed allocation mode, using
>    ext4_iomap_get_blocks() to directly allocate blocks.
> 
> Also add ext4_iomap_valid() for the iomap infrastructure to check extent
> validity.
> 
> Key changes and considerations:
> 
>  - Unwritten extents for new blocks (dioread_nolock always on)
>    Since data=ordered mode is not used to prevent stale data exposure in
>    the non-delayed allocation path, new blocks are always allocated as
>    unwritten extents.

Okay makes sense.

> 
>  - Short write and write failure handling
>    a. Delalloc path: On short write or failure, the stale delalloc range
>       must be dropped and its space reservation released. Otherwise, a
>       clean folio may cover leftover delalloc extents, causing
>       inaccurate space reservation accounting.

Hmm, okay so in the usual buffer head path, seems like during a short
write we still zero the new buffers we couldn't write and keep it dirty
(folio_zero_new_buffers()). This way they are still written back and
the delalloc reservations are used up.

However in iomap we don't mark the range that we couldn't write as dirty
so we need to make sure we clear up the stale delalloc mappings. Is this
correct?

Regards,
Ojaswin

>    b. Non-delalloc path: No cleanup of allocated blocks is needed on
>       short write.
> 
>  - Lock ordering reversal
>    The folio lock and transaction start ordering is reversed compared to
>    the buffer_head buffered write path. To handle this, the journal
>    handle must be stopped in iomap_begin() callbacks. The lock ordering
>    documentation in super.c has been updated accordingly.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h  |   4 ++
>  fs/ext4/file.c  |  20 +++++-
>  fs/ext4/inode.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/ext4/super.c |  10 ++-
>  4 files changed, 192 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1e27d73d7427..4832e7f7db82 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
>  void ext4_set_inode_mapping_order(struct inode *inode);
> +int ext4_nonda_switch(struct super_block *sb);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
>  
> @@ -3926,6 +3927,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>  
>  extern const struct iomap_ops ext4_iomap_ops;
>  extern const struct iomap_ops ext4_iomap_report_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
> +extern const struct iomap_write_ops ext4_iomap_write_ops;
>  
>  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>  {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index eb1a323962b1..7f9bfbbc4a4e 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return count;
>  }
>  
> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
> +					 struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	const struct iomap_ops *iomap_ops;
> +
> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
> +	else
> +		iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> +	return iomap_file_buffered_write(iocb, from, iomap_ops,
> +					 &ext4_iomap_write_ops, NULL);
> +}
> +
>  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  					struct iov_iter *from)
>  {
> @@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  	if (ret <= 0)
>  		goto out;
>  
> -	ret = generic_perform_write(iocb, from);
> +	if (ext4_inode_buffered_iomap(inode))
> +		ret = ext4_iomap_buffered_write(iocb, from);
> +	else
> +		ret = generic_perform_write(iocb, from);
>  
>  out:
>  	inode_unlock(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 39577a6b65b9..1ae7d3f4a1c8 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>  	return ret;
>  }
>  
> -static int ext4_nonda_switch(struct super_block *sb)
> +int ext4_nonda_switch(struct super_block *sb)
>  {
>  	s64 free_clusters, dirty_clusters;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>  	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
>  }
>  
> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
> +{
> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
> +}
> +
> +const struct iomap_write_ops ext4_iomap_write_ops = {
> +	.iomap_valid = ext4_iomap_valid,
> +};
> +
>  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  			   struct ext4_map_blocks *map, loff_t offset,
>  			   loff_t length, unsigned int flags)
> @@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>  		iomap->flags |= IOMAP_F_MERGED;
>  
> +	iomap->validity_cookie = map->m_seq;
> +
>  	/*
>  	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>  	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
> @@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
>  
> +/* Map blocks */
> +typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
> +
>  static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
> -		loff_t length, struct ext4_map_blocks *map)
> +		loff_t length, ext4_get_blocks_t get_blocks,
> +		struct ext4_map_blocks *map)
>  {
>  	u8 blkbits = inode->i_blkbits;
>  
> @@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>  	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>  			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
>  
> +	if (get_blocks)
> +		return get_blocks(inode, map);
> +
>  	return ext4_map_blocks(NULL, inode, map, 0);
>  }
>  
> @@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>  		return -ERANGE;
>  
> -	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
>  	if (ret < 0)
>  		return ret;
>  
> @@ -3946,6 +3964,147 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>  	return 0;
>  }
>  
> +static int ext4_iomap_get_blocks(struct inode *inode,
> +				 struct ext4_map_blocks *map)
> +{
> +	loff_t i_size = i_size_read(inode);
> +	handle_t *handle;
> +	int ret;
> +
> +	/*
> +	 * Check if the blocks have already been allocated, this could
> +	 * avoid initiating a new journal transaction and return the
> +	 * mapping information directly.
> +	 */
> +	if ((map->m_lblk + map->m_len) <=
> +	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
> +		ret = ext4_map_blocks(NULL, inode, map, 0);
> +		if (ret < 0)
> +			return ret;
> +		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
> +				    EXT4_MAP_DELAYED))
> +			return 0;
> +	}
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
> +			ext4_chunk_trans_blocks(inode, map->m_len));
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	ret = ext4_map_blocks(handle, inode, map,
> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> +	/*
> +	 * Stop handle here following the lock ordering of the folio lock
> +	 * and the transaction start.
> +	 */
> +	ext4_journal_stop(handle);
> +
> +	return ret;
> +}
> +
> +static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
> +{
> +	int ret, retries = 0;
> +	struct ext4_map_blocks map;
> +	ext4_get_blocks_t *get_blocks;
> +
> +	ret = ext4_emergency_state(inode->i_sb);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	/* Inline data and non-extent are not supported. */
> +	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
> +		return -ERANGE;
> +	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
> +		return -EINVAL;
> +	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
> +		return -EINVAL;
> +
> +	if (delalloc)
> +		get_blocks = ext4_da_map_blocks;
> +	else
> +		get_blocks = ext4_iomap_get_blocks;
> +retry:
> +	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
> +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> +		goto retry;
> +	if (ret < 0)
> +		return ret;
> +
> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
> +	return 0;
> +}
> +
> +static int ext4_iomap_buffered_write_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
> +						  iomap, srcmap, false);
> +}
> +
> +static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
> +						  iomap, srcmap, true);
> +}
> +
> +/*
> + * On write failure, drop the stale delayed allocation range and release
> + * its reserved space for both start and end blocks. Otherwise, we may
> + * leave a range of delayed extents covered by a clean folio, which can
> + * result in inaccurate space reservation accounting.
> + */
> +static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> +				     loff_t length, struct iomap *iomap)
> +{
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> +	up_write(&EXT4_I(inode)->i_data_sem);
> +}
> +
> +static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
> +					    loff_t length, ssize_t written,
> +					    unsigned int flags,
> +					    struct iomap *iomap)
> +{
> +	loff_t start_byte, end_byte;
> +
> +	/* If we didn't reserve the blocks, we're not allowed to punch them. */
> +	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
> +		return 0;
> +
> +	/* Nothing to do if we've written the entire delalloc extent */
> +	start_byte = iomap_last_written_block(inode, offset, written);
> +	end_byte = round_up(offset + length, i_blocksize(inode));
> +	if (start_byte >= end_byte)
> +		return 0;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
> +				     iomap, ext4_iomap_punch_delalloc);
> +	filemap_invalidate_unlock(inode->i_mapping);
> +	return 0;
> +}
> +
> +/*
> + * Since we always allocate unwritten extents, there is no need for
> + * iomap_end to clean up allocated blocks on a short write.
> + */
> +const struct iomap_ops ext4_iomap_buffered_write_ops = {
> +	.iomap_begin = ext4_iomap_buffered_write_begin,
> +};
> +
> +const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
> +	.iomap_begin = ext4_iomap_buffered_da_write_begin,
> +	.iomap_end = ext4_iomap_buffered_da_write_end,
> +};
> +
>  const struct iomap_ops ext4_iomap_buffered_read_ops = {
>  	.iomap_begin = ext4_iomap_buffered_read_begin,
>  };
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 6a77db4d3124..9bc294b769db 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
>   *   -> page lock -> i_data_sem (rw)
>   *
>   * buffered write path:
> - * sb_start_write -> i_mutex -> mmap_lock
> - * sb_start_write -> i_mutex -> transaction start -> page lock ->
> - *   i_data_sem (rw)
> + * sb_start_write -> i_rwsem (w) -> mmap_lock
> + * - buffer_head path:
> + *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
> + *     i_data_sem (rw)
> + * - iomap path:
> + *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
> + *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
>   *
>   * truncate:
>   * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
> -- 
> 2.52.0
> 

^ permalink raw reply

* Re: [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Ojaswin Mujoo @ 2026-05-26 17:02 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, yi.zhang, yizhang089, yangerkun,
	yukuai
In-Reply-To: <20260522030540.3896201-1-yi.zhang@huaweicloud.com>

On Fri, May 22, 2026 at 11:05:40AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> jbd2_journal_try_to_free_buffers() currently only tries to remove
> checkpointed data buffers from the checkpoint list for data=journal
> mode, and bails out if any buffer is still attached to a transaction.
> For data=ordered and writeback modes, data buffers never have
> journal_heads, so the function degenerates to a plain
> try_to_free_buffers() call.
> 
> Besides, the release of metadata buffers has been delegated to the jbd2
> journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
> release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
> not used for handling metadata buffers now.
> 
> However, the comment above the function still references
> jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
> BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
> ("jbd2: Remove data=ordered mode support using jbd buffer heads").
> 
> Replace it with a description of what the function actually does now.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good Zhang, feel free to add:

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
ojaswin

^ permalink raw reply

* Re: [PATCH v5 03/10] fstests: add test for inotify isolation on cloned devices
From: Anand Jain @ 2026-05-26 15:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Anand Jain, fstests, linux-btrfs, linux-ext4, linux-xfs, amir73il,
	zlang
In-Reply-To: <ahU9hpwrXyZChw9X@infradead.org>



On 26/5/26 14:28, Christoph Hellwig wrote:
> On Mon, May 25, 2026 at 04:35:58PM +0800, Anand Jain wrote:
>>> Also any reason to rely on the obsolete inotify instead of fsnotify?
>>
>> fsnotify is exercised in patch 4/10.
>> IMO, exercising inotify ensures we don't break legacy stuff.
> 
> fanotify and inotify use exactly the same backends, so I'm not sure
> why testing both matters. 

I noticed that back then I decided to keep both, since
fsnotifywait -F is only supported from kernel 5.1x, while
inotifywait serves the legacy LTS kernels where required.


> Not that I care very strongly, I'm just a
> bit confused.

I am happy dropping inotifywait if unnecessary and confusing.

Thanks.

^ permalink raw reply

* [PATCH 8/8] super: convert iterators to RCU readers + refcount_inc_not_zero
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

Walk @super_blocks and @fs_supers under rcu_read_lock() and pin the
current entry with refcount_inc_not_zero() instead of holding sb_lock
across the cursor advance. sb_lock was only there to keep the
cursor's ->next / ->prev pointer from being mutated by concurrent
list_del / list_add. RCU semantics give us that guarantee directly:
list_bidir_del_rcu() preserves both ->next and ->prev on the
unlinked entry and list_add_tail_rcu() publishes new entries with
the release barrier set up by the previous patch.

The pattern at each iterator is:

    rcu_read_lock();
    list_for_each_entry_rcu(sb, ...) {
            if (SB_DYING)                             continue;
            if (!refcount_inc_not_zero(&sb->s_count)) continue;
            rcu_read_unlock();

            ...                       /* may sleep on s_umount */

            if (prev)
                    put_super(prev);
            prev = sb;
            rcu_read_lock();          /* prev pinned: prev->{next,prev} valid */
    }
    rcu_read_unlock();
    if (prev)
            put_super(prev);

While we hold a pin on @prev, __put_super() cannot reach the
refcount_dec_and_test() transition that drives list_bidir_del_rcu().
So @prev stays on the list and concurrent list_bidir_del_rcu() of
other entries keeps @prev->s_list.{next,prev} pointing at the still-
live neighbour (or the head sentinel). The cursor advance after
re-acquiring rcu_read_lock() is therefore always against a live
chain in whichever direction we're walking.

put_super() now appears in the middle of the loop where __put_super()
used to be called with sb_lock held. It briefly takes sb_lock for
the trailing-ref drop; in the common case dec_and_test() returns
false and the lock is held for only a handful of cycles.

first_super() and next_super() switch the forward arm to READ_ONCE()
on the head and cursor ->next pointers and the reverse arm to
rcu_dereference(list_bidir_prev_rcu(...)). The forward arm matches
the semantics of list_entry_rcu() used internally by
list_for_each_entry_rcu(); the reverse arm is the canonical
bidirectional-RCU traversal pattern (see kernel/nstree.c) and is
needed because filesystems_freeze() and do_emergency_remount() pass
SUPER_ITER_REVERSE.

iterate_supers_type() and user_get_super() get the same treatment.
user_get_super() simplifies further: on a lookup hit where super_lock()
succeeds we return with the pin; on a lookup hit where super_lock()
fails (SB_DYING discovery) we put_super() and return NULL.

sget_fc() and grab_super() are not touched here.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 71 +++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 8c01b95be717..d9b1148f7030 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -831,17 +831,25 @@ enum super_iter_flags_t {
 
 static inline struct super_block *first_super(enum super_iter_flags_t flags)
 {
+	struct list_head *next;
+
 	if (flags & SUPER_ITER_REVERSE)
-		return list_last_entry(&super_blocks, struct super_block, s_list);
-	return list_first_entry(&super_blocks, struct super_block, s_list);
+		next = rcu_dereference(list_bidir_prev_rcu(&super_blocks));
+	else
+		next = READ_ONCE(super_blocks.next);
+	return list_entry(next, struct super_block, s_list);
 }
 
 static inline struct super_block *next_super(struct super_block *sb,
 					     enum super_iter_flags_t flags)
 {
+	struct list_head *next;
+
 	if (flags & SUPER_ITER_REVERSE)
-		return list_prev_entry(sb, s_list);
-	return list_next_entry(sb, s_list);
+		next = rcu_dereference(list_bidir_prev_rcu(&sb->s_list));
+	else
+		next = READ_ONCE(sb->s_list.next);
+	return list_entry(next, struct super_block, s_list);
 }
 
 static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
@@ -850,15 +858,15 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 	struct super_block *sb, *p = NULL;
 	bool excl = flags & SUPER_ITER_EXCL;
 
-	guard(spinlock)(&sb_lock);
-
+	rcu_read_lock();
 	for (sb = first_super(flags);
 	     !list_entry_is_head(sb, &super_blocks, s_list);
 	     sb = next_super(sb, flags)) {
 		if (super_flags(sb, SB_DYING))
 			continue;
-		refcount_inc(&sb->s_count);
-		spin_unlock(&sb_lock);
+		if (!refcount_inc_not_zero(&sb->s_count))
+			continue;
+		rcu_read_unlock();
 
 		if (flags & SUPER_ITER_UNLOCKED) {
 			f(sb, arg);
@@ -867,13 +875,14 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 			super_unlock(sb, excl);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 	if (p)
-		__put_super(p);
+		put_super(p);
 }
 
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
@@ -895,15 +904,15 @@ void iterate_supers_type(struct file_system_type *type,
 {
 	struct super_block *sb, *p = NULL;
 
-	spin_lock(&sb_lock);
-	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(sb, &type->fs_supers, s_instances) {
 		bool locked;
 
 		if (super_flags(sb, SB_DYING))
 			continue;
-
-		refcount_inc(&sb->s_count);
-		spin_unlock(&sb_lock);
+		if (!refcount_inc_not_zero(&sb->s_count))
+			continue;
+		rcu_read_unlock();
 
 		locked = super_lock_shared(sb);
 		if (locked) {
@@ -911,14 +920,14 @@ void iterate_supers_type(struct file_system_type *type,
 			super_unlock_shared(sb);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 	if (p)
-		__put_super(p);
-	spin_unlock(&sb_lock);
+		put_super(p);
 }
 
 EXPORT_SYMBOL(iterate_supers_type);
@@ -927,25 +936,21 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 {
 	struct super_block *sb;
 
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		bool locked;
-
+	rcu_read_lock();
+	list_for_each_entry_rcu(sb, &super_blocks, s_list) {
 		if (sb->s_dev != dev)
 			continue;
+		if (!refcount_inc_not_zero(&sb->s_count))
+			continue;
+		rcu_read_unlock();
 
-		refcount_inc(&sb->s_count);
-		spin_unlock(&sb_lock);
-
-		locked = super_lock(sb, excl);
-		if (locked)
+		if (super_lock(sb, excl))
 			return sb;
 
-		spin_lock(&sb_lock);
-		__put_super(sb);
-		break;
+		put_super(sb);
+		return NULL;
 	}
-	spin_unlock(&sb_lock);
+	rcu_read_unlock();
 	return NULL;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH 7/8] super: switch list manipulation to _rcu primitives
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

Swap the list/hlist write-side operations on @super_blocks and
@fs_type->fs_supers over to their _rcu variants. All three call sites
still hold sb_lock; this is a purely mechanical change that
establishes the writer-side memory ordering lockless RCU readers can
rely on in the next patch.

The affected sites are sget_fc() (list_add_tail() and
hlist_add_head() at the publish step), __put_super()
(list_del_init() -> list_bidir_del_rcu() of s_list when the last
temporary reference is dropped) and kill_super_notify()
(hlist_del_init() -> hlist_del_rcu() of s_instances).

@super_blocks gets list_bidir_del_rcu() rather than list_del_rcu()
because the next patch walks the list backward for
filesystems_freeze() and do_emergency_remount(). list_del_rcu()
preserves the unlinked entry's ->next pointer but poisons ->prev with
LIST_POISON2, which would crash any concurrent reverse traversal that
landed on the just-unlinked entry between the SB_DYING check and the
cursor advance. list_bidir_del_rcu() preserves both ->next and
->prev so reverse traversal stays safe. See kernel/nstree.c for the
canonical bidirectional-RCU list pattern.

The "_init" half of the deletions is not used elsewhere on these list
nodes after removal so dropping it is fine. The entry is about to be
freed via call_rcu(destroy_super_rcu) (for s_list) or to disappear
with the superblock (for s_instances, once the list has done its job
notifying SB_DEAD waiters).

Iterators keep using plain list_for_each_entry() and
hlist_for_each_entry() under sb_lock. Their conversion to lockless
RCU traversal with refcount_inc_not_zero() is the next patch.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 2fa7023010ec..8c01b95be717 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -407,7 +407,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 static void __put_super(struct super_block *s)
 {
 	if (refcount_dec_and_test(&s->s_count)) {
-		list_del_init(&s->s_list);
+		list_bidir_del_rcu(&s->s_list);
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(s->s_mounts);
@@ -445,7 +445,7 @@ static void kill_super_notify(struct super_block *sb)
 	 * SB_DEAD.
 	 */
 	spin_lock(&sb_lock);
-	hlist_del_init(&sb->s_instances);
+	hlist_del_rcu(&sb->s_instances);
 	spin_unlock(&sb_lock);

 	/*
@@ -784,8 +784,8 @@ struct super_block *sget_fc(struct fs_context *fc,
 	 * It's in a nascent state and users should wait on SB_BORN or
 	 * SB_DYING to be set.
 	 */
-	list_add_tail(&s->s_list, &super_blocks);
-	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
+	list_add_tail_rcu(&s->s_list, &super_blocks);
+	hlist_add_head_rcu(&s->s_instances, &s->s_type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(s->s_type);
 	shrinker_register(s->s_shrink);

-- 
2.47.3

^ permalink raw reply related

* [PATCH 6/8] super: convert sb->s_count to refcount_t
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

s_count is the temporary-reference count used to pin a superblock
across the spinlock-to-rwsem hop in every iterator and in
grab_super(). It's a plain int incremented and decremented only under
sb_lock.

Convert it to refcount_t. No semantic change yet: every increment
still happens with sb_lock held, so observation of a live ref is
still serialised by the lock. The increments use refcount_inc()
rather than refcount_inc_not_zero() because every callsite is still
looking at an sb known to be live under sb_lock.

This prepares the ground for switching iterators to RCU readers in a
later patch, at which point refcount_inc_not_zero() becomes the right
primitive at the lockless pin sites.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c                     | 14 +++++++-------
 include/linux/fs/super_types.h |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index c451f689c7b3..2fa7023010ec 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -366,7 +366,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	spin_lock_init(&s->s_inode_wblist_lock);
 	fserror_mount(s);
 
-	s->s_count = 1;
+	refcount_set(&s->s_count, 1);
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
 	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
@@ -406,7 +406,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
  */
 static void __put_super(struct super_block *s)
 {
-	if (!--s->s_count) {
+	if (refcount_dec_and_test(&s->s_count)) {
 		list_del_init(&s->s_list);
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
@@ -528,7 +528,7 @@ static bool grab_super(struct super_block *sb)
 {
 	bool locked;
 
-	sb->s_count++;
+	refcount_inc(&sb->s_count);
 	spin_unlock(&sb_lock);
 	locked = super_lock_excl(sb);
 	if (locked) {
@@ -857,7 +857,7 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 	     sb = next_super(sb, flags)) {
 		if (super_flags(sb, SB_DYING))
 			continue;
-		sb->s_count++;
+		refcount_inc(&sb->s_count);
 		spin_unlock(&sb_lock);
 
 		if (flags & SUPER_ITER_UNLOCKED) {
@@ -902,7 +902,7 @@ void iterate_supers_type(struct file_system_type *type,
 		if (super_flags(sb, SB_DYING))
 			continue;
 
-		sb->s_count++;
+		refcount_inc(&sb->s_count);
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -934,7 +934,7 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev != dev)
 			continue;
 
-		sb->s_count++;
+		refcount_inc(&sb->s_count);
 		spin_unlock(&sb_lock);
 
 		locked = super_lock(sb, excl);
@@ -1368,7 +1368,7 @@ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 
 	/* Make sure sb doesn't go away from under us */
 	spin_lock(&sb_lock);
-	sb->s_count++;
+	refcount_inc(&sb->s_count);
 	spin_unlock(&sb_lock);
 
 	mutex_unlock(&bdev->bd_holder_lock);
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 383050e7fdf5..3a8cc0c723a8 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -11,6 +11,7 @@
 #include <linux/uidgid.h>
 #include <linux/uuid.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
 #include <linux/workqueue_types.h>
 #include <linux/quota.h>
 
@@ -145,7 +146,7 @@ struct super_block {
 	unsigned long				s_magic;
 	struct dentry				*s_root;
 	struct rw_semaphore			s_umount;
-	int					s_count;
+	refcount_t				s_count;
 	atomic_t				s_active;
 #ifdef CONFIG_SECURITY
 	void					*s_security;

-- 
2.47.3


^ permalink raw reply related

* [PATCH 5/8] super: drop sb_lock from setup_bdev_super() tuple publication
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

The tuple {s_bdev_file, s_bdev, s_bdi, SB_I_STABLE_WRITES} written by
setup_bdev_super() is publication of immutable state, not list
integrity. The sb is already on @super_blocks and @fs_supers at this
point (sget_dev() -> sget_fc() put it there) but SB_BORN is unset, so
any iterator that calls super_lock() blocks on
wait_var_event(SB_BORN | SB_DYING).

The SUPER_ITER_UNLOCKED iterators (filesystems_freeze,
filesystems_thaw, do_emergency_remount) do not look at s_bdev, s_bdi
or s_iflags so they cannot observe a partial fill either.

When vfs_get_tree() later calls super_wake(sb, SB_BORN) it does

    smp_store_release(&sb->s_flags, sb->s_flags | SB_BORN)

and any reader gating on SB_BORN via super_flags() loads sb->s_flags
with smp_load_acquire(). The release/acquire pair orders the four
prior writes against the load of SB_BORN.

s_iflags is a shared field so use WRITE_ONCE() on the
read-modify-write to keep the compiler from tearing the store.
retire_super() is the only other writer of s_iflags and only runs
against an already-born sb under s_umount.

This drops one of the five sb_lock acquisitions in the mount path
with no behavioural change for any reader.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 5fe8cea9f8fe..c451f689c7b3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1576,13 +1576,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		bdev_fput(bdev_file);
 		return -EBUSY;
 	}
-	spin_lock(&sb_lock);
+	/*
+	 * Publish before SB_BORN is set. super_wake(sb, SB_BORN) below uses
+	 * smp_store_release(); any iterator that observes SB_BORN via
+	 * super_flags()'s smp_load_acquire() sees these writes.
+	 */
 	sb->s_bdev_file = bdev_file;
 	sb->s_bdev = bdev;
 	sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
 	if (bdev_stable_writes(bdev))
-		sb->s_iflags |= SB_I_STABLE_WRITES;
-	spin_unlock(&sb_lock);
+		WRITE_ONCE(sb->s_iflags, sb->s_iflags | SB_I_STABLE_WRITES);

 	snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
 	shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,

-- 
2.47.3

^ permalink raw reply related

* [PATCH 4/8] fs: retire sget()
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

sget() and sget_fc() have lived side by side as near-duplicate
find-or-create-and-publish helpers for the legacy and fs_context mount
APIs. The three remaining in-tree callers (CIFS plus the ext4 extents
and mballoc KUnit tests) have all been moved to sget_fc(). Nothing
calls sget() anymore.

Delete sget() from fs/super.c and the prototype in <linux/fs.h>.
Update the two comments that referred to "sget()" or "sget{_fc}()" to
just say "sget_fc()".

This removes ~60 lines of code that only existed to be kept in
lockstep with sget_fc() on every superblock publish-path change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/super.c   |  2 +-
 fs/super.c         | 71 ++++--------------------------------------------------
 include/linux/fs.h |  4 ---
 3 files changed, 6 insertions(+), 71 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b26aa9169e83..636154861d7c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2052,7 +2052,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
 	 * then open_ctree will properly initialize the file system specific
 	 * settings later.  btrfs_init_fs_info initializes the static elements
 	 * of the fs_info (locks and such) to make cleanup easier if we find a
-	 * superblock with our given fs_devices later on at sget() time.
+	 * superblock with our given fs_devices later on at sget_fc() time.
 	 */
 	fs_info = kvzalloc_obj(struct btrfs_fs_info);
 	if (!fs_info)
diff --git a/fs/super.c b/fs/super.c
index 378e81efe643..5fe8cea9f8fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -328,7 +328,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
-	 * sget() can have s_umount recursion.
+	 * sget_fc() can have s_umount recursion.
 	 *
 	 * When it cannot find a suitable sb, it allocates a new
 	 * one (this one), and tries again to find a suitable old
@@ -439,7 +439,7 @@ static void kill_super_notify(struct super_block *sb)
 
 	/*
 	 * Remove it from @fs_supers so it isn't found by new
-	 * sget{_fc}() walkers anymore. Any concurrent mounter still
+	 * sget_fc() walkers anymore. Any concurrent mounter still
 	 * managing to grab a temporary reference is guaranteed to
 	 * already see SB_DYING and will wait until we notify them about
 	 * SB_DEAD.
@@ -517,7 +517,7 @@ EXPORT_SYMBOL(deactivate_super);
  * @sb: superblock to acquire
  *
  * Acquire a temporary reference on a superblock and try to trade it for
- * an active reference. This is used in sget{_fc}() to wait for a
+ * an active reference. This is used in sget_fc() to wait for a
  * superblock to either become SB_BORN or for it to pass through
  * sb->kill() and be marked as SB_DEAD.
  *
@@ -673,11 +673,11 @@ void generic_shutdown_super(struct super_block *sb)
 	/*
 	 * Broadcast to everyone that grabbed a temporary reference to this
 	 * superblock before we removed it from @fs_supers that the superblock
-	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+	 * is dying. Every walker of @fs_supers outside of sget_fc() will now
 	 * discard this superblock and treat it as dead.
 	 *
 	 * We leave the superblock on @fs_supers so it can be found by
-	 * sget{_fc}() until we passed sb->kill_sb().
+	 * sget_fc() until we passed sb->kill_sb().
 	 */
 	super_wake(sb, SB_DYING);
 	super_unlock_excl(sb);
@@ -808,67 +808,6 @@ struct super_block *sget_fc(struct fs_context *fc,
 }
 EXPORT_SYMBOL(sget_fc);
 
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-	struct super_block *s = NULL;
-	struct super_block *old;
-	int err;
-
-retry:
-	spin_lock(&sb_lock);
-	if (test) {
-		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
-			if (!test(old, data))
-				continue;
-			if (user_ns != old->s_user_ns) {
-				spin_unlock(&sb_lock);
-				destroy_unused_super(s);
-				return ERR_PTR(-EBUSY);
-			}
-			if (!grab_super(old))
-				goto retry;
-			destroy_unused_super(s);
-			return old;
-		}
-	}
-	if (!s) {
-		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
-		if (!s)
-			return ERR_PTR(-ENOMEM);
-		goto retry;
-	}
-
-	err = set(s, data);
-	if (err) {
-		spin_unlock(&sb_lock);
-		destroy_unused_super(s);
-		return ERR_PTR(err);
-	}
-	s->s_type = type;
-	strscpy(s->s_id, type->name, sizeof(s->s_id));
-	list_add_tail(&s->s_list, &super_blocks);
-	hlist_add_head(&s->s_instances, &type->fs_supers);
-	spin_unlock(&sb_lock);
-	get_filesystem(type);
-	shrinker_register(s->s_shrink);
-	return s;
-}
-EXPORT_SYMBOL(sget);
-
 void drop_super(struct super_block *sb)
 {
 	super_unlock_shared(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..6dbe3218dc1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2327,10 +2327,6 @@ void free_anon_bdev(dev_t);
 struct super_block *sget_fc(struct fs_context *fc,
 			    int (*test)(struct super_block *, struct fs_context *),
 			    int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags, void *data);
 struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */

-- 
2.47.3


^ permalink raw reply related

* [PATCH 3/8] smb: client: convert cifs_smb3_do_mount() to sget_fc()
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

The CIFS mount path already runs through fs_context: smb3_get_tree()
calls smb3_get_tree_common() with a struct fs_context * in hand. But
the fc is dropped on the way to sget(). Plumb it through to sget_fc()
so the legacy sget() interface can go.

cifs_smb3_do_mount() now takes (struct fs_context *, struct
smb3_fs_context *). The old (fs_type, flags) pair is reconstructed
from fc->fs_type and fc->sb_flags. The flags argument was always
passed as 0 by the sole caller anyway. The cifs_dbg diagnostic now
prints fc->sb_flags directly.

cifs_match_super() and cifs_set_super() were the two void-data
callbacks for sget(). The match callback now takes
(struct super_block *, struct fs_context *) and reads struct
cifs_mnt_data out of fc->sget_key. The set callback is gone entirely:
sget_fc() pre-populates sb->s_fs_info from fc->s_fs_info before
invoking set() so set_anon_super_fc() (which just allocates an anon
bdev) is sufficient.

Before sget_fc() we stash cifs_sb in fc->s_fs_info, the per-mount data
in fc->sget_key and force fc->sb_flags to SB_NODIRATIME | SB_NOATIME
to reproduce the previous hard-coded behaviour (alloc_super() reads
fc->sb_flags). The original sb_flags is saved and restored around the
call so the rest of the mount path sees the same fc semantics as
before.

mnt_data.flags keeps its historical value of 0 so the CIFS_MS_MASK
comparison in compare_mount_options() returns the same (always-equal)
result.

No functional change. With this in place sget() has no remaining CIFS
caller.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/smb/client/cifsfs.c     | 43 ++++++++++++++++++++++++++-----------------
 fs/smb/client/cifsfs.h     |  3 ++-
 fs/smb/client/cifsproto.h  |  3 ++-
 fs/smb/client/connect.c    |  5 +++--
 fs/smb/client/fs_context.c |  2 +-
 5 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 9f76b0347fa9..d5074e3fbb85 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/filelock.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
@@ -966,26 +967,19 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
 	return dentry;
 }
 
-static int cifs_set_super(struct super_block *sb, void *data)
-{
-	struct cifs_mnt_data *mnt_data = data;
-	sb->s_fs_info = mnt_data->cifs_sb;
-	return set_anon_super(sb, NULL);
-}
-
 struct dentry *
-cifs_smb3_do_mount(struct file_system_type *fs_type,
-	      int flags, struct smb3_fs_context *old_ctx)
+cifs_smb3_do_mount(struct fs_context *fc, struct smb3_fs_context *old_ctx)
 {
 	struct cifs_mnt_data mnt_data;
 	struct cifs_sb_info *cifs_sb;
 	struct super_block *sb;
 	struct dentry *root;
+	unsigned int saved_sb_flags;
 	int rc;
 
 	if (cifsFYI) {
-		cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__,
-			 old_ctx->source, flags);
+		cifs_dbg(FYI, "%s: devname=%s sb_flags=0x%x\n", __func__,
+			 old_ctx->source, fc->sb_flags);
 	} else {
 		cifs_info("Attempting to mount %s\n", old_ctx->source);
 	}
@@ -1012,7 +1006,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	rc = cifs_mount(cifs_sb, cifs_sb->ctx);
 	if (rc) {
-		if (!(flags & SB_SILENT))
+		if (!(fc->sb_flags & SB_SILENT))
 			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
 				 rc);
 		root = ERR_PTR(rc);
@@ -1021,12 +1015,27 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	mnt_data.ctx = cifs_sb->ctx;
 	mnt_data.cifs_sb = cifs_sb;
-	mnt_data.flags = flags;
+	mnt_data.flags = 0;
 
-	/* BB should we make this contingent on mount parm? */
-	flags |= SB_NODIRATIME | SB_NOATIME;
-
-	sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+	/*
+	 * sb->s_flags is set from fc->sb_flags by alloc_super(). CIFS has
+	 * historically forced SB_NODIRATIME | SB_NOATIME on every mount and
+	 * ignored the caller-supplied SB_* flags. Preserve that behaviour by
+	 * overriding fc->sb_flags around the sget_fc() call.
+	 *
+	 * Hand cifs_sb to sget_fc() via fc->s_fs_info; sget_fc() copies it
+	 * onto sb->s_fs_info before running set() and clears fc->s_fs_info
+	 * on successful publish. Pass the rest of the per-mount context to
+	 * cifs_match_super() through fc->sget_key.
+	 */
+	saved_sb_flags = fc->sb_flags;
+	fc->sb_flags = SB_NODIRATIME | SB_NOATIME;
+	fc->s_fs_info = cifs_sb;
+	fc->sget_key = &mnt_data;
+	sb = sget_fc(fc, cifs_match_super, set_anon_super_fc);
+	fc->sget_key = NULL;
+	fc->s_fs_info = NULL;
+	fc->sb_flags = saved_sb_flags;
 	if (IS_ERR(sb)) {
 		cifs_umount(cifs_sb);
 		return ERR_CAST(sb);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index c455b15f2778..0a93f48924a5 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -144,8 +144,9 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct file *src_file,
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg);
 void cifs_setsize(struct inode *inode, loff_t offset);
 
+struct fs_context;
 struct smb3_fs_context;
-struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, int flags,
+struct dentry *cifs_smb3_do_mount(struct fs_context *fc,
 				  struct smb3_fs_context *old_ctx);
 
 char *cifs_silly_fullpath(struct dentry *dentry);
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4a25afda9448..a39572cbaadb 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -19,6 +19,7 @@
 struct statfs;
 struct smb_rqst;
 struct smb3_fs_context;
+struct fs_context;
 
 /*
  *****************************************************************
@@ -236,7 +237,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
 int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
-int cifs_match_super(struct super_block *sb, void *data);
+int cifs_match_super(struct super_block *sb, struct fs_context *fc);
 int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
 void cifs_umount(struct cifs_sb_info *cifs_sb);
 void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dcde25da468d..79762e6bbe50 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -6,6 +6,7 @@
  *
  */
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/net.h>
 #include <linux/string.h>
 #include <linux/sched/mm.h>
@@ -2991,9 +2992,9 @@ static int match_prepath(struct super_block *sb,
 }
 
 int
-cifs_match_super(struct super_block *sb, void *data)
+cifs_match_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct cifs_mnt_data *mnt_data = data;
+	struct cifs_mnt_data *mnt_data = fc->sget_key;
 	struct smb3_fs_context *ctx;
 	struct cifs_sb_info *cifs_sb;
 	struct TCP_Server_Info *tcp_srv;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b9544eb0381b..6aba4e1c9c27 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,7 +920,7 @@ static int smb3_get_tree_common(struct fs_context *fc)
 	struct dentry *root;
 	int rc = 0;
 
-	root = cifs_smb3_do_mount(fc->fs_type, 0, ctx);
+	root = cifs_smb3_do_mount(fc, ctx);
 	if (IS_ERR(root))
 		return PTR_ERR(root);
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH 2/8] ext4: convert mballoc KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

Same treatment as the extents KUnit test. The mballoc test uses sget()
as a thin "give me an initialized superblock" wrapper for a fake
file_system_type. Move it onto sget_fc() so sget() can go away.

Add a no-op mbt_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. mbt_set() now takes a struct
fs_context * (still a no-op). mbt_ext4_alloc_super_block() allocates
the fc, hands it to sget_fc() and drops the fc reference once the sb
is published.

No functional change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/mballoc-test.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 90ed505fa4b1..d90da44aadbd 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/random.h>
 
 #include "ext4.h"
@@ -63,8 +64,14 @@ static void mbt_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
+static int mbt_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
 static struct file_system_type mbt_fs_type = {
 	.name			= "mballoc test",
+	.init_fs_context	= mbt_init_fs_context,
 	.kill_sb		= mbt_kill_sb,
 };
 
@@ -127,7 +134,7 @@ static void mbt_mb_release(struct super_block *sb)
 	kfree(sb->s_bdev);
 }
 
-static int mbt_set(struct super_block *sb, void *data)
+static int mbt_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
@@ -136,13 +143,19 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
 {
 	struct mbt_ext4_super_block *fsb;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi;
 
 	fsb = kzalloc_obj(*fsb);
 	if (fsb == NULL)
 		return NULL;
 
-	sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+	fc = fs_context_for_mount(&mbt_fs_type, 0);
+	if (IS_ERR(fc))
+		goto out;
+
+	sb = sget_fc(fc, NULL, mbt_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb))
 		goto out;
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH 1/8] ext4: convert extents KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

The extents KUnit test uses sget() to get an initialized superblock for
its fake file_system_type. sget() predates fs_context and we want to
retire it. Switch this caller over to sget_fc().

Add a no-op ext_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. ext_set() now takes a struct
fs_context * (still a no-op). extents_kunit_init() allocates the fc,
hands it to sget_fc() and drops the fc reference once the sb is
published. sget_fc() does not retain a pointer to it.

No functional change for the test.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/extents-test.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index 6b53a3f39fcd..bd7795a82607 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -37,6 +37,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/gfp_types.h>
 #include <linux/stddef.h>
 
@@ -130,14 +131,20 @@ static void ext_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
-static int ext_set(struct super_block *sb, void *data)
+static int ext_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
+static int ext_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
 
 static struct file_system_type ext_fs_type = {
-	.name = "extents test",
-	.kill_sb = ext_kill_sb,
+	.name		 = "extents test",
+	.init_fs_context = ext_init_fs_context,
+	.kill_sb	 = ext_kill_sb,
 };
 
 static void extents_kunit_exit(struct kunit *test)
@@ -223,6 +230,7 @@ static int extents_kunit_init(struct kunit *test)
 	struct ext4_inode_info *ei;
 	struct inode *inode;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi = NULL;
 	struct kunit_ext_test_param *param =
 		(struct kunit_ext_test_param *)(test->param_value);
@@ -232,7 +240,13 @@ static int extents_kunit_init(struct kunit *test)
 	if (sbi == NULL)
 		return -ENOMEM;
 
-	sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL);
+	fc = fs_context_for_mount(&ext_fs_type, 0);
+	if (IS_ERR(fc)) {
+		kfree(sbi);
+		return PTR_ERR(fc);
+	}
+	sb = sget_fc(fc, NULL, ext_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb)) {
 		kfree(sbi);
 		return PTR_ERR(sb);

-- 
2.47.3


^ permalink raw reply related

* [PATCH 0/8] super: retire sget(), convert iterators to RCU
From: Christian Brauner @ 2026-05-26 15:09 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)

* retire sget(): CIFS plus the two ext4 KUnit tests (extents-test,
  mballoc-test) were the last in-tree callers, and all three convert
  cleanly to sget_fc(). That lets sget() and its prototype come out,
  taking ~60 lines that only existed to be kept in lockstep with
  sget_fc() on every publish-path change.

* Walk @super_blocks and @type->fs_supers under RCU, pinned by
  refcount_inc_not_zero(&sb->s_count). iterate_supers(),
  iterate_supers_type(), user_get_super(), do_emergency_remount(),
  filesystems_freeze() and filesystems_thaw() no longer hold sb_lock
  across the cursor advance.

  The conversion goes in four small steps. Drop sb_lock from
  setup_bdev_super(): the {s_bdev_file, s_bdev, s_bdi,
  SB_I_STABLE_WRITES} tuple is publication of immutable state, and
  SB_BORN already gates every reader via super_wake()'s
  smp_store_release paired with super_flags()'s smp_load_acquire. Then
  convert sb->s_count to refcount_t -- mechanical, every increment is
  still under sb_lock. Then switch the write-side list/hlist ops to
  their _rcu variants; @super_blocks gets list_bidir_del_rcu() so the
  reverse-walking iterators (filesystems_freeze, do_emergency_remount)
  keep a valid ->prev on the unlinked entry, matching the canonical
  pattern in kernel/nstree.c. Finally, convert the iterators themselves:
  cursor advance via READ_ONCE / rcu_dereference, with the previous
  entry kept pinned via its s_count across the rcu_read_unlock ->
  callback -> rcu_read_lock cycle.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Christian Brauner (8):
      ext4: convert extents KUnit test to sget_fc()
      ext4: convert mballoc KUnit test to sget_fc()
      smb: client: convert cifs_smb3_do_mount() to sget_fc()
      fs: retire sget()
      super: drop sb_lock from setup_bdev_super() tuple publication
      super: convert sb->s_count to refcount_t
      super: switch list manipulation to _rcu primitives
      super: convert iterators to RCU readers + refcount_inc_not_zero

 fs/btrfs/super.c               |   2 +-
 fs/ext4/extents-test.c         |  22 +++++-
 fs/ext4/mballoc-test.c         |  17 ++++-
 fs/smb/client/cifsfs.c         |  43 ++++++-----
 fs/smb/client/cifsfs.h         |   3 +-
 fs/smb/client/cifsproto.h      |   3 +-
 fs/smb/client/connect.c        |   5 +-
 fs/smb/client/fs_context.c     |   2 +-
 fs/super.c                     | 167 ++++++++++++++---------------------------
 include/linux/fs.h             |   4 -
 include/linux/fs/super_types.h |   3 +-
 11 files changed, 127 insertions(+), 144 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260526-work-sget-6bc80b96cba5

^ permalink raw reply

* Re: [PATCH 12/17] fuse: replace __get_free_page() with kmalloc()
From: Miklos Szeredi @ 2026-05-26 15:00 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian, linux-kernel, linux-fsdevel, ocfs2-devel,
	linux-nilfs, linux-nfs, jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <20260523-b4-fs-v1-12-275e36a83f0e@kernel.org>

On Sat, 23 May 2026 at 19:56, Mike Rapoport (Microsoft) <rppt@kernel.org> wrote:
>
> fuse_do_ioctl allocates memory for struct iov array using
> __get_free_page().
>
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
>
> Replace use of __get_free_page() with kmalloc().
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Acked-by: Miklos Szeredi <mszeredi@redhat.com>

Thanks,
Miklos

^ permalink raw reply

* Re: [PATCH v10 00/22] fs-verity support for XFS with post EOF merkle tree
From: Carlos Maiolino @ 2026-05-26 10:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Christian Brauner, Andrey Albershteyn, Andrey Albershteyn,
	linux-xfs, fsverity, linux-fsdevel, ebiggers, linux-ext4,
	linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong, david
In-Reply-To: <20260522120757.GA21041@lst.de>

On Fri, May 22, 2026 at 02:07:57PM +0200, Christoph Hellwig wrote:
> On Fri, May 22, 2026 at 12:03:20PM +0200, Christian Brauner wrote:
> > > I was expecting this to come through xfs tree too if Eric and Christian
> > > agree.
> > 
> > You may take it through the xfs tree if there are no conflicts with
> > vfs-7.2.iomap. If there are I want to add the iomap changes into
> > vfs-7.2.iomap that you can pull in.
> 
> Merging the iomap bits through the iomap branch might make sense, given
> that iomap usually tends to see quite a bit of activity.
> 

That sounds good to me. If you want to go ahead and pull in the iomap
bits, do so, and give me a heads up when you do it so I can pull your
branch locally.

Cheers.

^ permalink raw reply

* Re: Re: [PATCH 1/2] ext4: avoid RWM atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Bohdan Trach @ 2026-05-26 10:02 UTC (permalink / raw)
  To: Jan Kara
  Cc: Bohdan Trach, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel, lilith.oberhauser, bohdan.trach, mchehab+huawei
In-Reply-To: <wjkbwkkxz5ol3h5zo6npd7lrnzpvesox7cubor6t6gpx4hrdxa@utrefuc6apug>

On Mon, 25 May 2026 17:28:45, Jan Kara wrote:
> Good idea but do we really need the 'acquire' barrier here? I don't see
> anything that would really need this so I think
> EXT4_MB_GRP_TEST_AND_SET_READ() can be just:
> 
> test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state) || \
>   test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state)
> 
> or am I missing something?

Thanks for the review. Indeed, it should not be mandatory to use
test_bit_acquire(); I used it to keep the original ordering
semantics of test_and_set_bit(), as I understood them from [1],
and thus of the whole macro.

I'll use plain test_bit() in the v2 series (also taking into account your
comment on patch 2/2, and retesting).

[1] https://docs.kernel.org/core-api/wrappers/atomic_bitops.html

--
With best regards,
Bohdan Trach


^ permalink raw reply

* Re: [PATCH 10/17] jbd2: replace __get_free_pages() with kmalloc()
From: David Laight @ 2026-05-26  9:35 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jan Kara, Mike Rapoport (Microsoft), Jan Kara, Mark Fasheh,
	Joel Becker, Joseph Qi, Ryusuke Konishi, Viacheslav Dubeyko,
	Trond Myklebust, Anna Schumaker, Chuck Lever, Jeff Layton,
	NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, Alexander Viro,
	Christian Brauner, Dave Kleikamp, Theodore Ts'o,
	Miklos Szeredi, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian, linux-kernel, linux-fsdevel, ocfs2-devel,
	linux-nilfs, linux-nfs, jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <ahSNFmwAA17pMy6o@casper.infradead.org>

On Mon, 25 May 2026 18:55:34 +0100
Matthew Wilcox <willy@infradead.org> wrote:

> On Mon, May 25, 2026 at 06:21:34PM +0100, David Laight wrote:
> > Would kvalloc() be more appropriate here?  
> 
> no
> 
> > Does __get_free_pages() return physically contiguous memory?  
> 
> yes
> 

Thank you

^ permalink raw reply

* Re: [PATCH] jbd2: Remove special jbd2 slabs
From: Vlastimil Babka (SUSE) @ 2026-05-26  9:08 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle), Theodore Ts'o
  Cc: Jan Kara, linux-ext4, linux-fsdevel, Mike Rapoport (Microsoft)
In-Reply-To: <20260525201321.21717-1-willy@infradead.org>

On 5/25/26 10:13 PM, Matthew Wilcox (Oracle) wrote:
> When jbd2 was originally written, kmalloc() would not guarantee alignment
> for the requested memory.  Since commit 59bb47985c1d in 2019, kmalloc
> has guaranteed natural alignment for power-of-two allocations.  We can
> now remove the jbd2 special slabs and just use kmalloc() directly.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Nice!

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>




^ permalink raw reply

* Re: [PATCH 14/17] fs/namespace: use __getname() to allocate mntpath buffer
From: Mike Rapoport @ 2026-05-26  9:06 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <lwnrjpmzbv6swapmnmb5jki3xxxzqsxuks5vykniwhakvhqh7i@rhff3qrwfnoj>

On Mon, May 25, 2026 at 06:22:13PM +0200, Jan Kara wrote:
> On Sat 23-05-26 20:54:26, Mike Rapoport (Microsoft) wrote:
> > mnt_warn_timestamp_expiry() allocates memory for a path with
> > __get_free_page() although there is a dedicated helper for allocation of
> > file paths: __getname().
> > 
> > Replace __get_free_page() for allocation of a path buffer with __getname().
> > 
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
> >  fs/namespace.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index fe919abd2f01..2ed9cd846a81 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -3303,7 +3303,7 @@ static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
> >  	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
> >  		char *buf, *mntpath;
> >  
> > -		buf = (char *)__get_free_page(GFP_KERNEL);
> > +		buf = __getname();
> 
> Fair but d_path() below should then get PATH_MAX and not PAGE_SIZE.

Ack.
 
> >  		if (buf)
> >  			mntpath = d_path(mountpoint, buf, PAGE_SIZE);
> >  		else
> > @@ -3319,7 +3319,7 @@ static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
> >  
> >  		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
> >  		if (buf)
> > -			free_page((unsigned long)buf);
> > +			__putname(buf);
> 
> And __putname() is fine with NULL so no need for the if (buf) check here.

Will fix.
 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH] jbd2: Remove special jbd2 slabs
From: Mike Rapoport @ 2026-05-26  8:49 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Theodore Ts'o, Jan Kara, linux-ext4,
	linux-fsdevel, Vlastimil Babka
In-Reply-To: <d5tqn5svrwdc3e4umqi3bcd66fnebptymib642advq2jazgzow@x2saq45uirv7>

On Tue, May 26, 2026 at 09:50:52AM +0200, Jan Kara wrote:
> On Mon 25-05-26 21:13:19, Matthew Wilcox (Oracle) wrote:
> > When jbd2 was originally written, kmalloc() would not guarantee alignment
> > for the requested memory.  Since commit 59bb47985c1d in 2019, kmalloc
> > has guaranteed natural alignment for power-of-two allocations.  We can
> > now remove the jbd2 special slabs and just use kmalloc() directly.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> 
> Very nice! So this replaces Mike's patch, doesn't it? Feel free to add:

Yeah, this is better :)
 
> Reviewed-by: Jan Kara <jack@suse.cz>

Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

> 
> 								Honza
> 
> > ---
> >  fs/jbd2/commit.c      |   8 ++-
> >  fs/jbd2/journal.c     | 121 ++----------------------------------------
> >  fs/jbd2/transaction.c |   8 +--
> >  include/linux/jbd2.h  |   3 --
> >  4 files changed, 11 insertions(+), 129 deletions(-)
> > 
> > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> > index 38f318bb4279..2e8dbc4547bb 100644
> > --- a/fs/jbd2/commit.c
> > +++ b/fs/jbd2/commit.c
> > @@ -514,10 +514,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> >  		 * leave undo-committed data.
> >  		 */
> >  		if (jh->b_committed_data) {
> > -			struct buffer_head *bh = jh2bh(jh);
> > -
> >  			spin_lock(&jh->b_state_lock);
> > -			jbd2_free(jh->b_committed_data, bh->b_size);
> > +			kfree(jh->b_committed_data);
> >  			jh->b_committed_data = NULL;
> >  			spin_unlock(&jh->b_state_lock);
> >  		}
> > @@ -978,7 +976,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> >  		 * its triggers if they exist, so we can clear that too.
> >  		 */
> >  		if (jh->b_committed_data) {
> > -			jbd2_free(jh->b_committed_data, bh->b_size);
> > +			kfree(jh->b_committed_data);
> >  			jh->b_committed_data = NULL;
> >  			if (jh->b_frozen_data) {
> >  				jh->b_committed_data = jh->b_frozen_data;
> > @@ -986,7 +984,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> >  				jh->b_frozen_triggers = NULL;
> >  			}
> >  		} else if (jh->b_frozen_data) {
> > -			jbd2_free(jh->b_frozen_data, bh->b_size);
> > +			kfree(jh->b_frozen_data);
> >  			jh->b_frozen_data = NULL;
> >  			jh->b_frozen_triggers = NULL;
> >  		}
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index a6616380ce38..ad10c8a92fa0 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
> >  EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
> >  EXPORT_SYMBOL(jbd2_inode_cache);
> >  
> > -static int jbd2_journal_create_slab(size_t slab_size);
> > -
> >  #ifdef CONFIG_JBD2_DEBUG
> >  void __jbd2_debug(int level, const char *file, const char *func,
> >  		  unsigned int line, const char *fmt, ...)
> > @@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
> >  			goto escape_done;
> >  
> >  		spin_unlock(&jh_in->b_state_lock);
> > -		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
> > +		tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
> >  		spin_lock(&jh_in->b_state_lock);
> >  		if (jh_in->b_frozen_data) {
> > -			jbd2_free(tmp, bh_in->b_size);
> > +			kfree(tmp);
> >  			goto copy_done;
> >  		}
> >  
> > @@ -2063,14 +2061,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
> >  int jbd2_journal_load(journal_t *journal)
> >  {
> >  	int err;
> > -	journal_superblock_t *sb = journal->j_superblock;
> > -
> > -	/*
> > -	 * Create a slab for this blocksize
> > -	 */
> > -	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
> > -	if (err)
> > -		return err;
> >  
> >  	/* Let the recovery code check whether it needs to recover any
> >  	 * data from the journal. */
> > @@ -2698,108 +2688,6 @@ size_t journal_tag_bytes(journal_t *journal)
> >  		return sz - sizeof(__u32);
> >  }
> >  
> > -/*
> > - * JBD memory management
> > - *
> > - * These functions are used to allocate block-sized chunks of memory
> > - * used for making copies of buffer_head data.  Very often it will be
> > - * page-sized chunks of data, but sometimes it will be in
> > - * sub-page-size chunks.  (For example, 16k pages on Power systems
> > - * with a 4k block file system.)  For blocks smaller than a page, we
> > - * use a SLAB allocator.  There are slab caches for each block size,
> > - * which are allocated at mount time, if necessary, and we only free
> > - * (all of) the slab caches when/if the jbd2 module is unloaded.  For
> > - * this reason we don't need to a mutex to protect access to
> > - * jbd2_slab[] allocating or releasing memory; only in
> > - * jbd2_journal_create_slab().
> > - */
> > -#define JBD2_MAX_SLABS 8
> > -static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
> > -
> > -static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
> > -	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
> > -	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
> > -};
> > -
> > -
> > -static void jbd2_journal_destroy_slabs(void)
> > -{
> > -	int i;
> > -
> > -	for (i = 0; i < JBD2_MAX_SLABS; i++) {
> > -		kmem_cache_destroy(jbd2_slab[i]);
> > -		jbd2_slab[i] = NULL;
> > -	}
> > -}
> > -
> > -static int jbd2_journal_create_slab(size_t size)
> > -{
> > -	static DEFINE_MUTEX(jbd2_slab_create_mutex);
> > -	int i = order_base_2(size) - 10;
> > -	size_t slab_size;
> > -
> > -	if (size == PAGE_SIZE)
> > -		return 0;
> > -
> > -	if (i >= JBD2_MAX_SLABS)
> > -		return -EINVAL;
> > -
> > -	if (unlikely(i < 0))
> > -		i = 0;
> > -	mutex_lock(&jbd2_slab_create_mutex);
> > -	if (jbd2_slab[i]) {
> > -		mutex_unlock(&jbd2_slab_create_mutex);
> > -		return 0;	/* Already created */
> > -	}
> > -
> > -	slab_size = 1 << (i+10);
> > -	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
> > -					 slab_size, 0, NULL);
> > -	mutex_unlock(&jbd2_slab_create_mutex);
> > -	if (!jbd2_slab[i]) {
> > -		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
> > -		return -ENOMEM;
> > -	}
> > -	return 0;
> > -}
> > -
> > -static struct kmem_cache *get_slab(size_t size)
> > -{
> > -	int i = order_base_2(size) - 10;
> > -
> > -	BUG_ON(i >= JBD2_MAX_SLABS);
> > -	if (unlikely(i < 0))
> > -		i = 0;
> > -	BUG_ON(jbd2_slab[i] == NULL);
> > -	return jbd2_slab[i];
> > -}
> > -
> > -void *jbd2_alloc(size_t size, gfp_t flags)
> > -{
> > -	void *ptr;
> > -
> > -	BUG_ON(size & (size-1)); /* Must be a power of 2 */
> > -
> > -	if (size < PAGE_SIZE)
> > -		ptr = kmem_cache_alloc(get_slab(size), flags);
> > -	else
> > -		ptr = (void *)__get_free_pages(flags, get_order(size));
> > -
> > -	/* Check alignment; SLUB has gotten this wrong in the past,
> > -	 * and this can lead to user data corruption! */
> > -	BUG_ON(((unsigned long) ptr) & (size-1));
> > -
> > -	return ptr;
> > -}
> > -
> > -void jbd2_free(void *ptr, size_t size)
> > -{
> > -	if (size < PAGE_SIZE)
> > -		kmem_cache_free(get_slab(size), ptr);
> > -	else
> > -		free_pages((unsigned long)ptr, get_order(size));
> > -};
> > -
> >  /*
> >   * Journal_head storage management
> >   */
> > @@ -2977,11 +2865,11 @@ static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
> >  {
> >  	if (jh->b_frozen_data) {
> >  		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
> > -		jbd2_free(jh->b_frozen_data, b_size);
> > +		kfree(jh->b_frozen_data);
> >  	}
> >  	if (jh->b_committed_data) {
> >  		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
> > -		jbd2_free(jh->b_committed_data, b_size);
> > +		kfree(jh->b_committed_data);
> >  	}
> >  	journal_free_journal_head(jh);
> >  }
> > @@ -3142,7 +3030,6 @@ static void jbd2_journal_destroy_caches(void)
> >  	jbd2_journal_destroy_handle_cache();
> >  	jbd2_journal_destroy_inode_cache();
> >  	jbd2_journal_destroy_transaction_cache();
> > -	jbd2_journal_destroy_slabs();
> >  }
> >  
> >  static int __init journal_init(void)
> > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> > index 4885903bbd10..48ddb566d12d 100644
> > --- a/fs/jbd2/transaction.c
> > +++ b/fs/jbd2/transaction.c
> > @@ -1131,7 +1131,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
> >  		if (!frozen_buffer) {
> >  			JBUFFER_TRACE(jh, "allocate memory for buffer");
> >  			spin_unlock(&jh->b_state_lock);
> > -			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
> > +			frozen_buffer = kmalloc(jh2bh(jh)->b_size,
> >  						   GFP_NOFS | __GFP_NOFAIL);
> >  			goto repeat;
> >  		}
> > @@ -1159,7 +1159,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
> >  
> >  out:
> >  	if (unlikely(frozen_buffer))	/* It's usually NULL */
> > -		jbd2_free(frozen_buffer, bh->b_size);
> > +		kfree(frozen_buffer);
> >  
> >  	JBUFFER_TRACE(jh, "exit");
> >  	return error;
> > @@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
> >  
> >  repeat:
> >  	if (!jh->b_committed_data)
> > -		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
> > +		committed_data = kmalloc(jh2bh(jh)->b_size,
> >  					    GFP_NOFS|__GFP_NOFAIL);
> >  
> >  	spin_lock(&jh->b_state_lock);
> > @@ -1445,7 +1445,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
> >  out:
> >  	jbd2_journal_put_journal_head(jh);
> >  	if (unlikely(committed_data))
> > -		jbd2_free(committed_data, bh->b_size);
> > +		kfree(committed_data);
> >  	return err;
> >  }
> >  
> > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > index 7e785aa6d35d..b68561187e90 100644
> > --- a/include/linux/jbd2.h
> > +++ b/include/linux/jbd2.h
> > @@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
> >  #define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
> >  #endif
> >  
> > -extern void *jbd2_alloc(size_t size, gfp_t flags);
> > -extern void jbd2_free(void *ptr, size_t size);
> > -
> >  #define JBD2_MIN_JOURNAL_BLOCKS 1024
> >  #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
> >  
> > -- 
> > 2.47.3
> > 
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH] jbd2: Remove special jbd2 slabs
From: Jan Kara @ 2026-05-26  7:50 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Theodore Ts'o, Jan Kara, linux-ext4, linux-fsdevel,
	Mike Rapoport (Microsoft), Vlastimil Babka
In-Reply-To: <20260525201321.21717-1-willy@infradead.org>

On Mon 25-05-26 21:13:19, Matthew Wilcox (Oracle) wrote:
> When jbd2 was originally written, kmalloc() would not guarantee alignment
> for the requested memory.  Since commit 59bb47985c1d in 2019, kmalloc
> has guaranteed natural alignment for power-of-two allocations.  We can
> now remove the jbd2 special slabs and just use kmalloc() directly.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Very nice! So this replaces Mike's patch, doesn't it? Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/jbd2/commit.c      |   8 ++-
>  fs/jbd2/journal.c     | 121 ++----------------------------------------
>  fs/jbd2/transaction.c |   8 +--
>  include/linux/jbd2.h  |   3 --
>  4 files changed, 11 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 38f318bb4279..2e8dbc4547bb 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -514,10 +514,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
>  		 * leave undo-committed data.
>  		 */
>  		if (jh->b_committed_data) {
> -			struct buffer_head *bh = jh2bh(jh);
> -
>  			spin_lock(&jh->b_state_lock);
> -			jbd2_free(jh->b_committed_data, bh->b_size);
> +			kfree(jh->b_committed_data);
>  			jh->b_committed_data = NULL;
>  			spin_unlock(&jh->b_state_lock);
>  		}
> @@ -978,7 +976,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
>  		 * its triggers if they exist, so we can clear that too.
>  		 */
>  		if (jh->b_committed_data) {
> -			jbd2_free(jh->b_committed_data, bh->b_size);
> +			kfree(jh->b_committed_data);
>  			jh->b_committed_data = NULL;
>  			if (jh->b_frozen_data) {
>  				jh->b_committed_data = jh->b_frozen_data;
> @@ -986,7 +984,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
>  				jh->b_frozen_triggers = NULL;
>  			}
>  		} else if (jh->b_frozen_data) {
> -			jbd2_free(jh->b_frozen_data, bh->b_size);
> +			kfree(jh->b_frozen_data);
>  			jh->b_frozen_data = NULL;
>  			jh->b_frozen_triggers = NULL;
>  		}
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index a6616380ce38..ad10c8a92fa0 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
>  EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
>  EXPORT_SYMBOL(jbd2_inode_cache);
>  
> -static int jbd2_journal_create_slab(size_t slab_size);
> -
>  #ifdef CONFIG_JBD2_DEBUG
>  void __jbd2_debug(int level, const char *file, const char *func,
>  		  unsigned int line, const char *fmt, ...)
> @@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
>  			goto escape_done;
>  
>  		spin_unlock(&jh_in->b_state_lock);
> -		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
> +		tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
>  		spin_lock(&jh_in->b_state_lock);
>  		if (jh_in->b_frozen_data) {
> -			jbd2_free(tmp, bh_in->b_size);
> +			kfree(tmp);
>  			goto copy_done;
>  		}
>  
> @@ -2063,14 +2061,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
>  int jbd2_journal_load(journal_t *journal)
>  {
>  	int err;
> -	journal_superblock_t *sb = journal->j_superblock;
> -
> -	/*
> -	 * Create a slab for this blocksize
> -	 */
> -	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
> -	if (err)
> -		return err;
>  
>  	/* Let the recovery code check whether it needs to recover any
>  	 * data from the journal. */
> @@ -2698,108 +2688,6 @@ size_t journal_tag_bytes(journal_t *journal)
>  		return sz - sizeof(__u32);
>  }
>  
> -/*
> - * JBD memory management
> - *
> - * These functions are used to allocate block-sized chunks of memory
> - * used for making copies of buffer_head data.  Very often it will be
> - * page-sized chunks of data, but sometimes it will be in
> - * sub-page-size chunks.  (For example, 16k pages on Power systems
> - * with a 4k block file system.)  For blocks smaller than a page, we
> - * use a SLAB allocator.  There are slab caches for each block size,
> - * which are allocated at mount time, if necessary, and we only free
> - * (all of) the slab caches when/if the jbd2 module is unloaded.  For
> - * this reason we don't need to a mutex to protect access to
> - * jbd2_slab[] allocating or releasing memory; only in
> - * jbd2_journal_create_slab().
> - */
> -#define JBD2_MAX_SLABS 8
> -static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
> -
> -static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
> -	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
> -	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
> -};
> -
> -
> -static void jbd2_journal_destroy_slabs(void)
> -{
> -	int i;
> -
> -	for (i = 0; i < JBD2_MAX_SLABS; i++) {
> -		kmem_cache_destroy(jbd2_slab[i]);
> -		jbd2_slab[i] = NULL;
> -	}
> -}
> -
> -static int jbd2_journal_create_slab(size_t size)
> -{
> -	static DEFINE_MUTEX(jbd2_slab_create_mutex);
> -	int i = order_base_2(size) - 10;
> -	size_t slab_size;
> -
> -	if (size == PAGE_SIZE)
> -		return 0;
> -
> -	if (i >= JBD2_MAX_SLABS)
> -		return -EINVAL;
> -
> -	if (unlikely(i < 0))
> -		i = 0;
> -	mutex_lock(&jbd2_slab_create_mutex);
> -	if (jbd2_slab[i]) {
> -		mutex_unlock(&jbd2_slab_create_mutex);
> -		return 0;	/* Already created */
> -	}
> -
> -	slab_size = 1 << (i+10);
> -	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
> -					 slab_size, 0, NULL);
> -	mutex_unlock(&jbd2_slab_create_mutex);
> -	if (!jbd2_slab[i]) {
> -		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
> -		return -ENOMEM;
> -	}
> -	return 0;
> -}
> -
> -static struct kmem_cache *get_slab(size_t size)
> -{
> -	int i = order_base_2(size) - 10;
> -
> -	BUG_ON(i >= JBD2_MAX_SLABS);
> -	if (unlikely(i < 0))
> -		i = 0;
> -	BUG_ON(jbd2_slab[i] == NULL);
> -	return jbd2_slab[i];
> -}
> -
> -void *jbd2_alloc(size_t size, gfp_t flags)
> -{
> -	void *ptr;
> -
> -	BUG_ON(size & (size-1)); /* Must be a power of 2 */
> -
> -	if (size < PAGE_SIZE)
> -		ptr = kmem_cache_alloc(get_slab(size), flags);
> -	else
> -		ptr = (void *)__get_free_pages(flags, get_order(size));
> -
> -	/* Check alignment; SLUB has gotten this wrong in the past,
> -	 * and this can lead to user data corruption! */
> -	BUG_ON(((unsigned long) ptr) & (size-1));
> -
> -	return ptr;
> -}
> -
> -void jbd2_free(void *ptr, size_t size)
> -{
> -	if (size < PAGE_SIZE)
> -		kmem_cache_free(get_slab(size), ptr);
> -	else
> -		free_pages((unsigned long)ptr, get_order(size));
> -};
> -
>  /*
>   * Journal_head storage management
>   */
> @@ -2977,11 +2865,11 @@ static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
>  {
>  	if (jh->b_frozen_data) {
>  		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
> -		jbd2_free(jh->b_frozen_data, b_size);
> +		kfree(jh->b_frozen_data);
>  	}
>  	if (jh->b_committed_data) {
>  		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
> -		jbd2_free(jh->b_committed_data, b_size);
> +		kfree(jh->b_committed_data);
>  	}
>  	journal_free_journal_head(jh);
>  }
> @@ -3142,7 +3030,6 @@ static void jbd2_journal_destroy_caches(void)
>  	jbd2_journal_destroy_handle_cache();
>  	jbd2_journal_destroy_inode_cache();
>  	jbd2_journal_destroy_transaction_cache();
> -	jbd2_journal_destroy_slabs();
>  }
>  
>  static int __init journal_init(void)
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..48ddb566d12d 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -1131,7 +1131,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
>  		if (!frozen_buffer) {
>  			JBUFFER_TRACE(jh, "allocate memory for buffer");
>  			spin_unlock(&jh->b_state_lock);
> -			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
> +			frozen_buffer = kmalloc(jh2bh(jh)->b_size,
>  						   GFP_NOFS | __GFP_NOFAIL);
>  			goto repeat;
>  		}
> @@ -1159,7 +1159,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
>  
>  out:
>  	if (unlikely(frozen_buffer))	/* It's usually NULL */
> -		jbd2_free(frozen_buffer, bh->b_size);
> +		kfree(frozen_buffer);
>  
>  	JBUFFER_TRACE(jh, "exit");
>  	return error;
> @@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
>  
>  repeat:
>  	if (!jh->b_committed_data)
> -		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
> +		committed_data = kmalloc(jh2bh(jh)->b_size,
>  					    GFP_NOFS|__GFP_NOFAIL);
>  
>  	spin_lock(&jh->b_state_lock);
> @@ -1445,7 +1445,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
>  out:
>  	jbd2_journal_put_journal_head(jh);
>  	if (unlikely(committed_data))
> -		jbd2_free(committed_data, bh->b_size);
> +		kfree(committed_data);
>  	return err;
>  }
>  
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 7e785aa6d35d..b68561187e90 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
>  #define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
>  #endif
>  
> -extern void *jbd2_alloc(size_t size, gfp_t flags);
> -extern void jbd2_free(void *ptr, size_t size);
> -
>  #define JBD2_MIN_JOURNAL_BLOCKS 1024
>  #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
>  
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v5 03/10] fstests: add test for inotify isolation on cloned devices
From: Christoph Hellwig @ 2026-05-26  6:28 UTC (permalink / raw)
  To: Anand Jain
  Cc: Christoph Hellwig, Anand Jain, fstests, linux-btrfs, linux-ext4,
	linux-xfs, amir73il, zlang
In-Reply-To: <edadc872-5796-4cff-934b-cee66fde79d2@gmail.com>

On Mon, May 25, 2026 at 04:35:58PM +0800, Anand Jain wrote:
> > Also any reason to rely on the obsolete inotify instead of fsnotify?
> 
> fsnotify is exercised in patch 4/10.
> IMO, exercising inotify ensures we don't break legacy stuff.

fanotify and inotify use exactly the same backends, so I'm not sure
why testing both matters.  Not that I care very strongly, I'm just a
bit confused.


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox