Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* Re: [PATCH v5 03/10] fstests: add test for inotify isolation on cloned devices
From: Christoph Hellwig @ 2026-05-26  6:28 UTC (permalink / raw)
  To: Anand Jain
  Cc: Christoph Hellwig, Anand Jain, fstests, linux-btrfs, linux-ext4,
	linux-xfs, amir73il, zlang
In-Reply-To: <edadc872-5796-4cff-934b-cee66fde79d2@gmail.com>

On Mon, May 25, 2026 at 04:35:58PM +0800, Anand Jain wrote:
> > Also any reason to rely on the obsolete inotify instead of fsnotify?
> 
> fsnotify is exercised in patch 4/10.
> IMO, exercising inotify ensures we don't break legacy stuff.

fanotify and inotify use exactly the same backends, so I'm not sure
why testing both matters.  Not that I care very strongly, I'm just a
bit confused.


^ permalink raw reply

* [PATCH] jbd2: Remove special jbd2 slabs
From: Matthew Wilcox (Oracle) @ 2026-05-25 20:13 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Matthew Wilcox (Oracle), Jan Kara, linux-ext4, linux-fsdevel,
	Mike Rapoport (Microsoft), Vlastimil Babka

When jbd2 was originally written, kmalloc() would not guarantee alignment
for the requested memory.  Since commit 59bb47985c1d in 2019, kmalloc
has guaranteed natural alignment for power-of-two allocations.  We can
now remove the jbd2 special slabs and just use kmalloc() directly.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/jbd2/commit.c      |   8 ++-
 fs/jbd2/journal.c     | 121 ++----------------------------------------
 fs/jbd2/transaction.c |   8 +--
 include/linux/jbd2.h  |   3 --
 4 files changed, 11 insertions(+), 129 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 38f318bb4279..2e8dbc4547bb 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -514,10 +514,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 * leave undo-committed data.
 		 */
 		if (jh->b_committed_data) {
-			struct buffer_head *bh = jh2bh(jh);
-
 			spin_lock(&jh->b_state_lock);
-			jbd2_free(jh->b_committed_data, bh->b_size);
+			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
 			spin_unlock(&jh->b_state_lock);
 		}
@@ -978,7 +976,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 * its triggers if they exist, so we can clear that too.
 		 */
 		if (jh->b_committed_data) {
-			jbd2_free(jh->b_committed_data, bh->b_size);
+			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
@@ -986,7 +984,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				jh->b_frozen_triggers = NULL;
 			}
 		} else if (jh->b_frozen_data) {
-			jbd2_free(jh->b_frozen_data, bh->b_size);
+			kfree(jh->b_frozen_data);
 			jh->b_frozen_data = NULL;
 			jh->b_frozen_triggers = NULL;
 		}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a6616380ce38..ad10c8a92fa0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -95,8 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
-static int jbd2_journal_create_slab(size_t slab_size);
-
 #ifdef CONFIG_JBD2_DEBUG
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...)
@@ -385,10 +383,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 			goto escape_done;
 
 		spin_unlock(&jh_in->b_state_lock);
-		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
+		tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
 		spin_lock(&jh_in->b_state_lock);
 		if (jh_in->b_frozen_data) {
-			jbd2_free(tmp, bh_in->b_size);
+			kfree(tmp);
 			goto copy_done;
 		}
 
@@ -2063,14 +2061,6 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 int jbd2_journal_load(journal_t *journal)
 {
 	int err;
-	journal_superblock_t *sb = journal->j_superblock;
-
-	/*
-	 * Create a slab for this blocksize
-	 */
-	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
-	if (err)
-		return err;
 
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
@@ -2698,108 +2688,6 @@ size_t journal_tag_bytes(journal_t *journal)
 		return sz - sizeof(__u32);
 }
 
-/*
- * JBD memory management
- *
- * These functions are used to allocate block-sized chunks of memory
- * used for making copies of buffer_head data.  Very often it will be
- * page-sized chunks of data, but sometimes it will be in
- * sub-page-size chunks.  (For example, 16k pages on Power systems
- * with a 4k block file system.)  For blocks smaller than a page, we
- * use a SLAB allocator.  There are slab caches for each block size,
- * which are allocated at mount time, if necessary, and we only free
- * (all of) the slab caches when/if the jbd2 module is unloaded.  For
- * this reason we don't need to a mutex to protect access to
- * jbd2_slab[] allocating or releasing memory; only in
- * jbd2_journal_create_slab().
- */
-#define JBD2_MAX_SLABS 8
-static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-
-static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
-	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
-	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
-};
-
-
-static void jbd2_journal_destroy_slabs(void)
-{
-	int i;
-
-	for (i = 0; i < JBD2_MAX_SLABS; i++) {
-		kmem_cache_destroy(jbd2_slab[i]);
-		jbd2_slab[i] = NULL;
-	}
-}
-
-static int jbd2_journal_create_slab(size_t size)
-{
-	static DEFINE_MUTEX(jbd2_slab_create_mutex);
-	int i = order_base_2(size) - 10;
-	size_t slab_size;
-
-	if (size == PAGE_SIZE)
-		return 0;
-
-	if (i >= JBD2_MAX_SLABS)
-		return -EINVAL;
-
-	if (unlikely(i < 0))
-		i = 0;
-	mutex_lock(&jbd2_slab_create_mutex);
-	if (jbd2_slab[i]) {
-		mutex_unlock(&jbd2_slab_create_mutex);
-		return 0;	/* Already created */
-	}
-
-	slab_size = 1 << (i+10);
-	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
-					 slab_size, 0, NULL);
-	mutex_unlock(&jbd2_slab_create_mutex);
-	if (!jbd2_slab[i]) {
-		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static struct kmem_cache *get_slab(size_t size)
-{
-	int i = order_base_2(size) - 10;
-
-	BUG_ON(i >= JBD2_MAX_SLABS);
-	if (unlikely(i < 0))
-		i = 0;
-	BUG_ON(jbd2_slab[i] == NULL);
-	return jbd2_slab[i];
-}
-
-void *jbd2_alloc(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	BUG_ON(size & (size-1)); /* Must be a power of 2 */
-
-	if (size < PAGE_SIZE)
-		ptr = kmem_cache_alloc(get_slab(size), flags);
-	else
-		ptr = (void *)__get_free_pages(flags, get_order(size));
-
-	/* Check alignment; SLUB has gotten this wrong in the past,
-	 * and this can lead to user data corruption! */
-	BUG_ON(((unsigned long) ptr) & (size-1));
-
-	return ptr;
-}
-
-void jbd2_free(void *ptr, size_t size)
-{
-	if (size < PAGE_SIZE)
-		kmem_cache_free(get_slab(size), ptr);
-	else
-		free_pages((unsigned long)ptr, get_order(size));
-};
-
 /*
  * Journal_head storage management
  */
@@ -2977,11 +2865,11 @@ static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
 {
 	if (jh->b_frozen_data) {
 		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
-		jbd2_free(jh->b_frozen_data, b_size);
+		kfree(jh->b_frozen_data);
 	}
 	if (jh->b_committed_data) {
 		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
-		jbd2_free(jh->b_committed_data, b_size);
+		kfree(jh->b_committed_data);
 	}
 	journal_free_journal_head(jh);
 }
@@ -3142,7 +3030,6 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_handle_cache();
 	jbd2_journal_destroy_inode_cache();
 	jbd2_journal_destroy_transaction_cache();
-	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4885903bbd10..48ddb566d12d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1131,7 +1131,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 		if (!frozen_buffer) {
 			JBUFFER_TRACE(jh, "allocate memory for buffer");
 			spin_unlock(&jh->b_state_lock);
-			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
+			frozen_buffer = kmalloc(jh2bh(jh)->b_size,
 						   GFP_NOFS | __GFP_NOFAIL);
 			goto repeat;
 		}
@@ -1159,7 +1159,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 
 out:
 	if (unlikely(frozen_buffer))	/* It's usually NULL */
-		jbd2_free(frozen_buffer, bh->b_size);
+		kfree(frozen_buffer);
 
 	JBUFFER_TRACE(jh, "exit");
 	return error;
@@ -1424,7 +1424,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 
 repeat:
 	if (!jh->b_committed_data)
-		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
+		committed_data = kmalloc(jh2bh(jh)->b_size,
 					    GFP_NOFS|__GFP_NOFAIL);
 
 	spin_lock(&jh->b_state_lock);
@@ -1445,7 +1445,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 out:
 	jbd2_journal_put_journal_head(jh);
 	if (unlikely(committed_data))
-		jbd2_free(committed_data, bh->b_size);
+		kfree(committed_data);
 	return err;
 }
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 7e785aa6d35d..b68561187e90 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -63,9 +63,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
 #define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
-extern void *jbd2_alloc(size_t size, gfp_t flags);
-extern void jbd2_free(void *ptr, size_t size);
-
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
 
-- 
2.47.3


^ permalink raw reply related

* [syzbot] INFO: task kworker/u8:NUM:NUM blocked in I/O wait for more than NUM seconds. (2)
From: syzbot @ 2026-05-25 18:08 UTC (permalink / raw)
  To: jack, linux-ext4, linux-kernel, syzkaller-bugs, tytso

Hello,

syzbot found the following issue on:

HEAD commit:    e7ae89a0c97c Linux 7.1-rc5
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=104f6e1f980000
kernel config:  https://syzkaller.appspot.com/x/.config?x=58acee1ac5406016
dashboard link: https://syzkaller.appspot.com/bug?extid=ce3f1479b6afc3f5d0cc
compiler:       gcc (Debian 14.2.0-19) 14.2.0, GNU ld (GNU Binutils for Debian) 2.44

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/9b0c5b4e3645/disk-e7ae89a0.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/ed163d3ad68b/vmlinux-e7ae89a0.xz
kernel image: https://storage.googleapis.com/syzbot-assets/f2408b333334/bzImage-e7ae89a0.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+ce3f1479b6afc3f5d0cc@syzkaller.appspotmail.com

INFO: task kworker/u8:6:1008 blocked in I/O wait for more than 143 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u8:6    state:D stack:20696 pid:1008  tgid:1008  ppid:2      task_flags:0x4248060 flags:0x00080000
Workqueue: writeback wb_workfn (flush-8:0)
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 do_get_write_access+0x84f/0x1220 fs/jbd2/transaction.c:1113
 jbd2_journal_get_write_access+0x1d6/0x280 fs/jbd2/transaction.c:1263
 __ext4_journal_get_write_access+0x6a/0x340 fs/ext4/ext4_jbd2.c:241
 ext4_reserve_inode_write+0x1b7/0x330 fs/ext4/inode.c:6375
 __ext4_mark_inode_dirty+0x18f/0x890 fs/ext4/inode.c:6550
 __ext4_ext_dirty+0x1b2/0x230 fs/ext4/extents.c:207
 ext4_ext_insert_extent+0x11ff/0x4540 fs/ext4/extents.c:2210
 ext4_ext_map_blocks+0x21c8/0x5930 fs/ext4/extents.c:4482
 ext4_map_create_blocks+0xec/0x5e0 fs/ext4/inode.c:631
 ext4_map_blocks+0x46b/0xd30 fs/ext4/inode.c:824
 mpage_map_one_extent fs/ext4/inode.c:2396 [inline]
 mpage_map_and_submit_extent fs/ext4/inode.c:2490 [inline]
 ext4_do_writepages+0x2313/0x3f20 fs/ext4/inode.c:2948
 ext4_writepages+0x347/0x790 fs/ext4/inode.c:3042
 do_writepages+0x278/0x600 mm/page-writeback.c:2571
 __writeback_single_inode+0x164/0x1350 fs/fs-writeback.c:1764
 writeback_sb_inodes+0x766/0x1c60 fs/fs-writeback.c:2056
 __writeback_inodes_wb+0xf8/0x2d0 fs/fs-writeback.c:2132
 wb_writeback+0x720/0xb90 fs/fs-writeback.c:2243
 wb_check_old_data_flush fs/fs-writeback.c:2347 [inline]
 wb_do_writeback fs/fs-writeback.c:2400 [inline]
 wb_workfn+0x8dd/0xc00 fs/fs-writeback.c:2428
 process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314
 process_scheduled_works kernel/workqueue.c:3397 [inline]
 worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
INFO: task jbd2/sda1-8:4956 blocked in I/O wait for more than 144 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:jbd2/sda1-8     state:D stack:27000 pid:4956  tgid:4956  ppid:2      task_flags:0x240040 flags:0x00080000
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 __wait_on_buffer+0x64/0x70 fs/buffer.c:123
 wait_on_buffer include/linux/buffer_head.h:420 [inline]
 jbd2_journal_commit_transaction+0x388a/0x6870 fs/jbd2/commit.c:837
 kjournald2+0x200/0x760 fs/jbd2/journal.c:201
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
INFO: task syz-executor:5615 blocked in I/O wait for more than 144 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor    state:D stack:22480 pid:5615  tgid:5615  ppid:5614   task_flags:0x440100 flags:0x00080000
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 do_get_write_access+0x84f/0x1220 fs/jbd2/transaction.c:1113
 jbd2_journal_get_write_access+0x1d6/0x280 fs/jbd2/transaction.c:1263
 __ext4_journal_get_write_access+0x6a/0x340 fs/ext4/ext4_jbd2.c:241
 ext4_reserve_inode_write+0x1b7/0x330 fs/ext4/inode.c:6375
 __ext4_mark_inode_dirty+0x18f/0x890 fs/ext4/inode.c:6550
 ext4_dirty_inode+0xd9/0x130 fs/ext4/inode.c:6587
 __mark_inode_dirty+0x1f3/0x1720 fs/fs-writeback.c:2623
 generic_update_time fs/inode.c:2198 [inline]
 file_update_time_flags+0x46b/0x500 fs/inode.c:2428
 ext4_page_mkwrite+0x324/0x1890 fs/ext4/inode.c:6753
 do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 do_shared_fault mm/memory.c:5985 [inline]
 do_fault+0x3b5/0x1750 mm/memory.c:6047
 do_pte_missing mm/memory.c:4566 [inline]
 handle_pte_fault mm/memory.c:6427 [inline]
 __handle_mm_fault+0x187d/0x2a00 mm/memory.c:6565
 handle_mm_fault+0x36d/0xa20 mm/memory.c:6734
 do_user_addr_fault+0x5a3/0x12f0 arch/x86/mm/fault.c:1334
 handle_page_fault arch/x86/mm/fault.c:1474 [inline]
 exc_page_fault+0x6f/0xd0 arch/x86/mm/fault.c:1527
 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:618
RIP: 0033:0x7f7c33c7b467
RSP: 002b:00007ffd3e90b380 EFLAGS: 00010202
RAX: 00007f7c31f0c000 RBX: 00005555940107b0 RCX: 0000000000000006
RDX: 0000000000000ed3 RSI: 000055559400d0d8 RDI: 0000000000000040
RBP: 00007ffd3e90b880 R08: 00000000000000c4 R09: 000055559400d0d8
R10: 00007f7c33c00000 R11: 0000000000000202 R12: 00007ffd3e90b6c0
R13: 00007ffd3e90b4b0 R14: 585858582e7a7973 R15: 00007ffd3e90b400
 </TASK>
INFO: task syz.3.261:7085 blocked in I/O wait for more than 144 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.3.261       state:D stack:27128 pid:7085  tgid:7085  ppid:5630   task_flags:0x440040 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 do_get_write_access+0x84f/0x1220 fs/jbd2/transaction.c:1113
 jbd2_journal_get_write_access+0x1d6/0x280 fs/jbd2/transaction.c:1263
 __ext4_journal_get_write_access+0x6a/0x340 fs/ext4/ext4_jbd2.c:241
 ext4_reserve_inode_write+0x1b7/0x330 fs/ext4/inode.c:6375
 __ext4_mark_inode_dirty+0x18f/0x890 fs/ext4/inode.c:6550
 ext4_dirty_inode+0xd9/0x130 fs/ext4/inode.c:6587
 __mark_inode_dirty+0x1f3/0x1720 fs/fs-writeback.c:2623
 generic_update_time fs/inode.c:2198 [inline]
 file_update_time_flags+0x46b/0x500 fs/inode.c:2428
 ext4_page_mkwrite+0x324/0x1890 fs/ext4/inode.c:6753
 do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 do_shared_fault mm/memory.c:5985 [inline]
 do_fault+0x3b5/0x1750 mm/memory.c:6047
 do_pte_missing mm/memory.c:4566 [inline]
 handle_pte_fault mm/memory.c:6427 [inline]
 __handle_mm_fault+0x187d/0x2a00 mm/memory.c:6565
 handle_mm_fault+0x36d/0xa20 mm/memory.c:6734
 do_user_addr_fault+0x5a3/0x12f0 arch/x86/mm/fault.c:1334
 handle_page_fault arch/x86/mm/fault.c:1474 [inline]
 exc_page_fault+0x6f/0xd0 arch/x86/mm/fault.c:1527
 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:618
RIP: 0033:0x7f5e25071580
RSP: 002b:00007ffdb95adcf0 EFLAGS: 00010202
RAX: 0000001b34904000 RBX: ffffffff89ed2dd7 RCX: 0000001b34903ff8
RDX: 0000001b34524220 RSI: 0000000000000008 RDI: 00007f5e25f45720
RBP: 0000000000000078 R08: 00007f5e25400000 R09: 00007f5e25402000
R10: 0000000089ed2ddb R11: 0000000000000017 R12: 00007f5e25416128
R13: 0000000000000077 R14: ffffffff89ed2dd7 R15: 00007f5e25f45720
 </TASK>
INFO: task syz.2.260:7087 blocked in I/O wait for more than 145 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.2.260       state:D stack:27128 pid:7087  tgid:7087  ppid:5637   task_flags:0x440040 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 do_get_write_access+0x84f/0x1220 fs/jbd2/transaction.c:1113
 jbd2_journal_get_write_access+0x1d6/0x280 fs/jbd2/transaction.c:1263
 __ext4_journal_get_write_access+0x6a/0x340 fs/ext4/ext4_jbd2.c:241
 ext4_reserve_inode_write+0x1b7/0x330 fs/ext4/inode.c:6375
 __ext4_mark_inode_dirty+0x18f/0x890 fs/ext4/inode.c:6550
 ext4_dirty_inode+0xd9/0x130 fs/ext4/inode.c:6587
 __mark_inode_dirty+0x1f3/0x1720 fs/fs-writeback.c:2623
 generic_update_time fs/inode.c:2198 [inline]
 file_update_time_flags+0x46b/0x500 fs/inode.c:2428
 ext4_page_mkwrite+0x324/0x1890 fs/ext4/inode.c:6753
 do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 do_shared_fault mm/memory.c:5985 [inline]
 do_fault+0x3b5/0x1750 mm/memory.c:6047
 do_pte_missing mm/memory.c:4566 [inline]
 handle_pte_fault mm/memory.c:6427 [inline]
 __handle_mm_fault+0x187d/0x2a00 mm/memory.c:6565
 handle_mm_fault+0x36d/0xa20 mm/memory.c:6734
 do_user_addr_fault+0x5a3/0x12f0 arch/x86/mm/fault.c:1334
 handle_page_fault arch/x86/mm/fault.c:1474 [inline]
 exc_page_fault+0x6f/0xd0 arch/x86/mm/fault.c:1527
 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:618
RIP: 0033:0x7f095de65711
RSP: 002b:00007ffcba281b70 EFLAGS: 00010202
RAX: 0000001b34464000 RBX: 0000000000000000 RCX: 000000000003fde8
RDX: 0000001b34463fff RSI: 0000000000000008 RDI: 00007f095ed45720
RBP: 00007f095ed45720 R08: 0000000000000000 R09: 00007f095e216218
R10: 00007f095ed45700 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000003
 </TASK>
INFO: task syz.0.262:7095 blocked in I/O wait for more than 145 seconds.
      Tainted: G             L      syzkaller #0
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz.0.262       state:D stack:27384 pid:7095  tgid:7095  ppid:5638   task_flags:0x440040 flags:0x00080002
Call Trace:
 <TASK>
 context_switch kernel/sched/core.c:5388 [inline]
 __schedule+0x1295/0x67a0 kernel/sched/core.c:7189
 __schedule_loop kernel/sched/core.c:7268 [inline]
 schedule+0xdd/0x390 kernel/sched/core.c:7283
 io_schedule+0x8a/0xf0 kernel/sched/core.c:8110
 bit_wait_io+0xd/0xe0 kernel/sched/wait_bit.c:250
 __wait_on_bit+0x65/0x180 kernel/sched/wait_bit.c:52
 out_of_line_wait_on_bit+0xdc/0x110 kernel/sched/wait_bit.c:67
 wait_on_bit_io include/linux/wait_bit.h:105 [inline]
 do_get_write_access+0x84f/0x1220 fs/jbd2/transaction.c:1113
 jbd2_journal_get_write_access+0x1d6/0x280 fs/jbd2/transaction.c:1263
 __ext4_journal_get_write_access+0x6a/0x340 fs/ext4/ext4_jbd2.c:241
 ext4_reserve_inode_write+0x1b7/0x330 fs/ext4/inode.c:6375
 __ext4_mark_inode_dirty+0x18f/0x890 fs/ext4/inode.c:6550
 ext4_dirty_inode+0xd9/0x130 fs/ext4/inode.c:6587
 __mark_inode_dirty+0x1f3/0x1720 fs/fs-writeback.c:2623
 generic_update_time fs/inode.c:2198 [inline]
 touch_atime+0x642/0x7a0 fs/inode.c:2273
 file_accessed include/linux/fs.h:2264 [inline]
 ext4_file_mmap_prepare+0x56d/0x670 fs/ext4/file.c:840
 vfs_mmap_prepare include/linux/fs.h:2076 [inline]
 call_mmap_prepare mm/vma.c:2672 [inline]
 __mmap_region+0xe98/0x2da0 mm/vma.c:2755
 mmap_region+0x527/0x620 mm/vma.c:2857
 do_mmap+0xc63/0x12f0 mm/mmap.c:560
 vm_mmap_pgoff+0x29e/0x470 mm/util.c:581
 ksys_mmap_pgoff+0x3cb/0x610 mm/mmap.c:606
 __do_sys_mmap arch/x86/kernel/sys_x86_64.c:89 [inline]
 __se_sys_mmap arch/x86/kernel/sys_x86_64.c:82 [inline]
 __x64_sys_mmap+0x125/0x190 arch/x86/kernel/sys_x86_64.c:82
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x10b/0x830 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f94ab39cbc2
RSP: 002b:00007ffda7897338 EFLAGS: 00000206 ORIG_RAX: 0000000000000009
RAX: ffffffffffffffda RBX: 0000001b34264000 RCX: 00007f94ab39cbc2
RDX: 0000000000000003 RSI: 00000000003c0000 RDI: 0000001b34264000
RBP: 0000000000100001 R08: 0000000000000004 R09: 0000000000040000
R10: 0000000000100001 R11: 0000000000000206 R12: 000000000000003f
R13: 00000000000927c0 R14: 00000000000215f4 R15: 00007ffda7897610
 </TASK>

Showing all locks held in the system:
1 lock held by khungtaskd/30:
 #0: ffffffff8e7e5420 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline]
 #0: ffffffff8e7e5420 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
 #0: ffffffff8e7e5420 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x3d/0x184 kernel/locking/lockdep.c:6775
6 locks held by kworker/u8:6/1008:
 #0: ffff888020af0940 ((wq_completion)writeback){+.+.}-{0:0}, at: process_one_work+0x12d6/0x1980 kernel/workqueue.c:3289
 #1: ffffc90005407d08 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: process_one_work+0x973/0x1980 kernel/workqueue.c:3290
 #2: ffff88802be4a0d8 (&type->s_umount_key#33){++++}-{4:4}, at: super_trylock_shared+0x1e/0xf0 fs/super.c:565
 #3: ffff88802be48c18 (&sbi->s_writepages_rwsem){++++}-{0:0}, at: do_writepages+0x278/0x600 mm/page-writeback.c:2571
 #4: ffff88802bf7e938 (jbd2_handle){++++}-{0:0}, at: start_this_handle+0xfaa/0x13a0 fs/jbd2/transaction.c:444
 #5: ffff888076429e30 (&ei->i_data_sem){++++}-{4:4}, at: ext4_map_blocks+0x45a/0xd30 fs/ext4/inode.c:823
2 locks held by getty/5386:
 #0: ffff8880334de0a0 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x24/0x80 drivers/tty/tty_ldisc.c:243
 #1: ffffc900032332e8 (&ldata->atomic_read_lock){+.+.}-{4:4}, at: n_tty_read+0x419/0x14f0 drivers/tty/n_tty.c:2211
3 locks held by syz-executor/5615:
 #0: ffff888034479448 (vm_lock){++++}-{0:0}, at: lock_vma_under_rcu+0x11d/0x590 mm/mmap_lock.c:310
 #1: ffff88802be4a508 (sb_pagefaults){.+.+}-{0:0}, at: do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 #2: ffff88802bf7e938 (jbd2_handle){++++}-{0:0}, at: start_this_handle+0xfaa/0x13a0 fs/jbd2/transaction.c:444
3 locks held by syz.3.261/7085:
 #0: ffff88807f470e48 (vm_lock){++++}-{0:0}, at: lock_vma_under_rcu+0x11d/0x590 mm/mmap_lock.c:310
 #1: ffff88802be4a508 (sb_pagefaults){.+.+}-{0:0}, at: do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 #2: ffff88802bf7e938 (jbd2_handle){++++}-{0:0}, at: start_this_handle+0xfaa/0x13a0 fs/jbd2/transaction.c:444
3 locks held by syz.2.260/7087:
 #0: ffff888079bb7588 (vm_lock){++++}-{0:0}, at: lock_vma_under_rcu+0x11d/0x590 mm/mmap_lock.c:310
 #1: ffff88802be4a508 (sb_pagefaults){.+.+}-{0:0}, at: do_page_mkwrite+0x17a/0x440 mm/memory.c:3684
 #2: ffff88802bf7e938 (jbd2_handle){++++}-{0:0}, at: start_this_handle+0xfaa/0x13a0 fs/jbd2/transaction.c:444
3 locks held by syz.0.262/7095:
 #0: ffff888036958f78 (&mm->mmap_lock){++++}-{4:4}, at: mmap_write_lock_killable include/linux/mmap_lock.h:554 [inline]
 #0: ffff888036958f78 (&mm->mmap_lock){++++}-{4:4}, at: vm_mmap_pgoff+0x1f5/0x470 mm/util.c:579
 #1: ffff88802be4a410 (sb_writers#4){.+.+}-{0:0}, at: file_accessed include/linux/fs.h:2264 [inline]
 #1: ffff88802be4a410 (sb_writers#4){.+.+}-{0:0}, at: ext4_file_mmap_prepare+0x56d/0x670 fs/ext4/file.c:840
 #2: ffff88802bf7e938 (jbd2_handle){++++}-{0:0}, at: start_this_handle+0xfaa/0x13a0 fs/jbd2/transaction.c:444

=============================================

NMI backtrace for cpu 0
CPU: 0 UID: 0 PID: 30 Comm: khungtaskd Tainted: G             L      syzkaller #0 PREEMPT(full) 
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x100/0x190 lib/dump_stack.c:120
 nmi_cpu_backtrace.cold+0x12d/0x151 lib/nmi_backtrace.c:113
 nmi_trigger_cpumask_backtrace+0x1d7/0x230 lib/nmi_backtrace.c:62
 trigger_all_cpu_backtrace include/linux/nmi.h:162 [inline]
 __sys_info lib/sys_info.c:157 [inline]
 sys_info+0x141/0x190 lib/sys_info.c:165
 check_hung_uninterruptible_tasks kernel/hung_task.c:353 [inline]
 watchdog+0xcb1/0x1030 kernel/hung_task.c:561
 kthread+0x370/0x450 kernel/kthread.c:436
 ret_from_fork+0x72b/0xd50 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH 10/17] jbd2: replace __get_free_pages() with kmalloc()
From: Matthew Wilcox @ 2026-05-25 17:55 UTC (permalink / raw)
  To: David Laight
  Cc: Jan Kara, Mike Rapoport (Microsoft), Jan Kara, Mark Fasheh,
	Joel Becker, Joseph Qi, Ryusuke Konishi, Viacheslav Dubeyko,
	Trond Myklebust, Anna Schumaker, Chuck Lever, Jeff Layton,
	NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey, Alexander Viro,
	Christian Brauner, Dave Kleikamp, Theodore Ts'o,
	Miklos Szeredi, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian, linux-kernel, linux-fsdevel, ocfs2-devel,
	linux-nilfs, linux-nfs, jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <20260525182134.04045610@pumpkin>

On Mon, May 25, 2026 at 06:21:34PM +0100, David Laight wrote:
> Would kvmalloc() be more appropriate here?

no

> Does __get_free_pages() return physically contiguous memory?

yes

^ permalink raw reply

* Re: [PATCH 10/17] jbd2: replace __get_free_pages() with kmalloc()
From: David Laight @ 2026-05-25 17:21 UTC (permalink / raw)
  To: Jan Kara
  Cc: Mike Rapoport (Microsoft), Jan Kara, Mark Fasheh, Joel Becker,
	Joseph Qi, Ryusuke Konishi, Viacheslav Dubeyko, Trond Myklebust,
	Anna Schumaker, Chuck Lever, Jeff Layton, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Alexander Viro,
	Christian Brauner, Dave Kleikamp, Theodore Ts'o,
	Miklos Szeredi, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian, linux-kernel, linux-fsdevel, ocfs2-devel,
	linux-nilfs, linux-nfs, jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <2omm5gmnv2khshoxkrag5rusd3qzrsqyjgsef2syxgryrtg6vq@ao7oabqwebgo>

On Mon, 25 May 2026 18:17:04 +0200
Jan Kara <jack@suse.cz> wrote:

> On Sat 23-05-26 20:54:22, Mike Rapoport (Microsoft) wrote:
> > jbd2_alloc() falls back from kmem_cache_alloc() to __get_free_pages() for
> > allocations of PAGE_SIZE or larger.
> > But kmalloc() can handle such cases with essentially the same fallback.
> > 
> > Replace use of __get_free_pages() with kmalloc() and simplify
> > jbd2_free() as both kmem_cache_alloc() and kmalloc() allocations can be
> > freed with kfree().
> > 
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>  
> 
> Looks good. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> I'll just note that we allocate a buffer the size of a filesystem block
> here, so the same kind of allocator as we use for folios would be even
> better. But that's a different cleanup I guess.

Would kvmalloc() be more appropriate here?
Does __get_free_pages() return physically contiguous memory?

-- David

> 
> 								Honza
> 
> > ---
> >  fs/jbd2/journal.c | 7 ++-----
> >  1 file changed, 2 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index 4f397fcdb13c..1137b471e490 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -2784,7 +2784,7 @@ void *jbd2_alloc(size_t size, gfp_t flags)
> >  	if (size < PAGE_SIZE)
> >  		ptr = kmem_cache_alloc(get_slab(size), flags);
> >  	else
> > -		ptr = (void *)__get_free_pages(flags, get_order(size));
> > +		ptr = kmalloc(size, flags);
> >  
> >  	/* Check alignment; SLUB has gotten this wrong in the past,
> >  	 * and this can lead to user data corruption! */
> > @@ -2795,10 +2795,7 @@ void *jbd2_alloc(size_t size, gfp_t flags)
> >  
> >  void jbd2_free(void *ptr, size_t size)
> >  {
> > -	if (size < PAGE_SIZE)
> > -		kmem_cache_free(get_slab(size), ptr);
> > -	else
> > -		free_pages((unsigned long)ptr, get_order(size));
> > +	kfree(ptr);
> >  };
> >  
> >  /*
> > 
> > -- 
> > 2.53.0
> >   


^ permalink raw reply

* [PATCH 17/34] jbd2: Convert jbd2_write_superblock() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by using bh_submit() instead of
submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/jbd2/journal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4f397fcdb13c..a6616380ce38 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1821,8 +1821,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
 	if (jbd2_journal_has_csum_v2or3(journal))
 		sb->s_checksum = jbd2_superblock_csum(sb);
 	get_bh(bh);
-	bh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE | write_flags, bh);
+	bh_submit(bh, REQ_OP_WRITE | write_flags, bh_end_write);
 	wait_on_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		clear_buffer_write_io_error(bh);
-- 
2.47.3


^ permalink raw reply related

* [PATCH 16/34] jbd2: Convert journal commit to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by using bh_submit()
instead of submit_bh() in journal_submit_commit_record()
and jbd2_journal_commit_transaction().  These both use
journal_end_buffer_io_sync(), so it's more straightforward to do them
both at once.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/jbd2/commit.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8cf61e7185c4..38f318bb4279 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -29,8 +29,10 @@
 /*
  * IO end handler for temporary buffer_heads handling writes to the journal.
  */
-static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void journal_end_buffer_io_sync(struct bio *bio)
 {
+	bool uptodate = bio->bi_status == BLK_STS_OK;
+	struct buffer_head *bh = bio_endio_bh(bio);
 	struct buffer_head *orig_bh = bh->b_private;
 
 	BUFFER_TRACE(bh, "");
@@ -147,13 +149,12 @@ static int journal_submit_commit_record(journal_t *journal,
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
-	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_has_feature_async_commit(journal))
 		write_flags |= REQ_PREFLUSH | REQ_FUA;
 
-	submit_bh(write_flags, bh);
+	bh_submit(bh, write_flags, journal_end_buffer_io_sync);
 	*cbh = bh;
 	return 0;
 }
@@ -751,9 +752,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				lock_buffer(bh);
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
-				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
-					  bh);
+				bh_submit(bh,
+					REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+					journal_end_buffer_io_sync);
 			}
 			cond_resched();
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH 13/34] ext4: Convert ext4_fc_submit_bh() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by converting
ext4_end_buffer_io_sync() from bh_end_io_t to bio_end_io_t and
calling bh_submit().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/fast_commit.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b3c22636251d..d52c64adf416 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -184,8 +184,11 @@
 #include <trace/events/ext4.h>
 static struct kmem_cache *ext4_fc_dentry_cachep;
 
-static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void ext4_end_buffer_io_sync(struct bio *bio)
 {
+	bool uptodate = bio->bi_status == BLK_STS_OK;
+	struct buffer_head *bh = bio_endio_bh(bio);
+
 	BUFFER_TRACE(bh, "");
 	if (uptodate) {
 		ext4_debug("%s: Block %lld up-to-date",
@@ -659,8 +662,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 	lock_buffer(bh);
 	set_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
-	bh->b_end_io = ext4_end_buffer_io_sync;
-	submit_bh(REQ_OP_WRITE | write_flags, bh);
+	bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync);
 	EXT4_SB(sb)->s_fc_bh = NULL;
 }
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH 15/34] ext4: Convert ext4_commit_super() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by using bh_submit() instead of
submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/super.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fbe175951e01..905d66cbe3f2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6320,9 +6320,8 @@ static int ext4_commit_super(struct super_block *sb)
 	get_bh(sbh);
 	/* Clear potential dirty bit if it was journalled update */
 	clear_buffer_dirty(sbh);
-	sbh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE | REQ_SYNC |
-		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+	bh_submit(sbh, REQ_OP_WRITE | REQ_SYNC |
+		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), bh_end_write);
 	wait_on_buffer(sbh);
 	if (buffer_write_io_error(sbh)) {
 		ext4_msg(sb, KERN_ERR, "I/O error while writing "
-- 
2.47.3


^ permalink raw reply related

* [PATCH 14/34] ext4: Convert write_mmp_block_thawed() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by using bh_submit() instead of
submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/mmp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6f57c181ff77..493528fbed75 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -46,9 +46,9 @@ static int write_mmp_block_thawed(struct super_block *sb,
 
 	ext4_mmp_csum_set(sb, mmp);
 	lock_buffer(bh);
-	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
+	bh_submit(bh, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+			bh_end_write);
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh)))
 		return -EIO;
-- 
2.47.3


^ permalink raw reply related

* [PATCH 12/34] ext4: Convert __ext4_read_bh() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-25 17:19 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260525171931.4144395-1-willy@infradead.org>

Avoid an extra indirect function call by converting
ext4_end_bitmap_read() from bh_end_io_t to bio_end_io_t and
calling bh_submit().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/ext4.h   | 10 +++++-----
 fs/ext4/ialloc.c |  5 ++++-
 fs/ext4/super.c  | 11 ++++++-----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..6af11f0ff1c5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2959,7 +2959,7 @@ extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
-extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+void ext4_end_bitmap_read(struct bio *bio);
 
 /* fast_commit.c */
 int ext4_fc_info_show(struct seq_file *seq, void *v);
@@ -3184,10 +3184,10 @@ extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 						   sector_t block);
 extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
 						sector_t block);
-extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
-				bh_end_io_t *end_io, bool simu_fail);
-extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
-			bh_end_io_t *end_io, bool simu_fail);
+void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
+		bio_end_io_t end_io, bool simu_fail);
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+		bio_end_io_t end_io, bool simu_fail);
 extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
 extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
 extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fd8f0099852..2db68b1bf855 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -66,8 +66,11 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
 }
 
-void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+void ext4_end_bitmap_read(struct bio *bio)
 {
+	bool uptodate = bio->bi_status == BLK_STS_OK;
+	struct buffer_head *bh = bio_endio_bh(bio);
+
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 		set_bitmap_uptodate(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..fbe175951e01 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -161,7 +161,7 @@ MODULE_ALIAS("ext3");
 
 
 static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
-				  bh_end_io_t *end_io, bool simu_fail)
+				  bio_end_io_t end_io, bool simu_fail)
 {
 	if (simu_fail) {
 		clear_buffer_uptodate(bh);
@@ -176,13 +176,14 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
 	 */
 	clear_buffer_verified(bh);
 
-	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
+	if (!end_io)
+		end_io = bh_end_read;
 	get_bh(bh);
-	submit_bh(REQ_OP_READ | op_flags, bh);
+	bh_submit(bh, REQ_OP_READ | op_flags, end_io);
 }
 
 void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
-			 bh_end_io_t *end_io, bool simu_fail)
+			 bio_end_io_t end_io, bool simu_fail)
 {
 	BUG_ON(!buffer_locked(bh));
 
@@ -194,7 +195,7 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
 }
 
 int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
-		 bh_end_io_t *end_io, bool simu_fail)
+		 bio_end_io_t end_io, bool simu_fail)
 {
 	BUG_ON(!buffer_locked(bh));
 
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH 04/17] nilfs2: replace get_zeroed_page() with kzalloc()
From: Viacheslav Dubeyko @ 2026-05-25 17:07 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft), Jan Kara, Mark Fasheh, Joel Becker,
	Joseph Qi, Ryusuke Konishi, Viacheslav Dubeyko, Trond Myklebust,
	Anna Schumaker, Chuck Lever, Jeff Layton, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Alexander Viro,
	Christian Brauner, Jan Kara, Dave Kleikamp, Theodore Ts'o,
	Miklos Szeredi, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian
  Cc: linux-kernel, linux-fsdevel, ocfs2-devel, linux-nilfs, linux-nfs,
	jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <20260523-b4-fs-v1-4-275e36a83f0e@kernel.org>

On Sat, 2026-05-23 at 20:54 +0300, Mike Rapoport (Microsoft) wrote:
> nilfs_ioctl_wrap_copy() allocates a temporary buffer with
> get_zeroed_page().
> 
> kzalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of get_zeroed_page() with kzalloc().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  fs/nilfs2/ioctl.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
> index e0a606643e87..b73f2c5d10f0 100644
> --- a/fs/nilfs2/ioctl.c
> +++ b/fs/nilfs2/ioctl.c
> @@ -69,7 +69,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
>  	if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
>  		return -EINVAL;
>  
> -	buf = (void *)get_zeroed_page(GFP_NOFS);
> +	buf = kzalloc(PAGE_SIZE, GFP_NOFS);
>  	if (unlikely(!buf))
>  		return -ENOMEM;
>  	maxmembs = PAGE_SIZE / argv->v_size;
> @@ -107,7 +107,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
>  	}
>  	argv->v_nmembs = total;
>  
> -	free_pages((unsigned long)buf, 0);
> +	kfree(buf);
>  	return ret;
>  }
>  

Makes sense to me.

Reviewed-by: Viacheslav Dubeyko <slava@dubeyko.com>

Thanks,
Slava.


^ permalink raw reply

* Re: [PATCH v2] ext2: Remove deprecated DAX support
From: Jan Kara @ 2026-05-25 16:30 UTC (permalink / raw)
  To: Ashwin Gundarapu; +Cc: jack, linux-ext4, linux-kernel
In-Reply-To: <19e587e5143.434d5026117452.233431645221906532@zohomail.in>

On Sun 24-05-26 11:08:53, Ashwin Gundarapu wrote:
> 
> DAX support in ext2 was deprecated in commit d5a2693f93e4
> ("ext2: Deprecate DAX") with a removal deadline of end of 2025.
> Remove all DAX code from ext2 as scheduled.
> 
> This removes the DAX mount option, IOMAP DAX support, DAX file
> operations, DAX address_space_operations, and the DAX fault handler.
> 
> Signed-off-by: Ashwin Gundarapu <linuxuser509@zohomail.in>
> ---
> v2: Removed unused sbi variable and fixed indentation as reported
>     by kernel test robot.

Thanks for the patch. Some style nits below.

>  static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
> -#ifdef CONFIG_FS_DAX
> -	if (IS_DAX(iocb->ki_filp->f_mapping->host))
> -		return ext2_dax_read_iter(iocb, to);
> -#endif
> +

Stray empty line here.

>  	if (iocb->ki_flags & IOCB_DIRECT)
>  		return ext2_dio_read_iter(iocb, to);
>  
> @@ -297,10 +188,7 @@ static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  
>  static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
> -#ifdef CONFIG_FS_DAX
> -	if (IS_DAX(iocb->ki_filp->f_mapping->host))
> -		return ext2_dax_write_iter(iocb, from);
> -#endif
> +

... and here.

>  	if (iocb->ki_flags & IOCB_DIRECT)
>  		return ext2_dio_write_iter(iocb, from);
>  
> @@ -321,7 +209,7 @@ const struct file_operations ext2_file_operations = {
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl	= ext2_compat_ioctl,
>  #endif
> -	.mmap_prepare	= ext2_file_mmap_prepare,
> +	.mmap_prepare = generic_file_mmap_prepare,

Please indent this with tab the same way as other methods.

> @@ -841,10 +818,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  
>  	iomap->flags = 0;
>  	iomap->offset = (u64)first_block << blkbits;
> -	if (flags & IOMAP_DAX)
> -		iomap->dax_dev = sbi->s_daxdev;
> -	else
> -		iomap->bdev = inode->i_sb->s_bdev;
> +        iomap->bdev = inode->i_sb->s_bdev;

Indented with spaces instead of tabs.

> @@ -1290,12 +1248,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
>  
>  	inode_dio_wait(inode);
>  
> -	if (IS_DAX(inode))
> -		error = dax_truncate_page(inode, newsize, NULL,
> -					  &ext2_iomap_ops);
> -	else
> -		error = block_truncate_page(inode->i_mapping,
> -				newsize, ext2_get_block);
> +        error = block_truncate_page(inode->i_mapping,
> +                                newsize, ext2_get_block);

Indented with spaces instead of tabs.

>  	if (error)
>  		return error;
>  

...
> +        case Opt_xip:
> +                ext2_msg_fc(fc, KERN_ERR, "DAX support has been removed. Please use ext4 instead.");
> +                return -EINVAL;

Indented with spaces instead of tabs.

> @@ -992,16 +974,8 @@ static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
>  	}
>  	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
>  
> -	if (test_opt(sb, DAX)) {
> -		if (!sbi->s_daxdev) {
> -			ext2_msg(sb, KERN_ERR,
> -				"DAX unsupported by block device. Turning off DAX.");
> -			clear_opt(sbi->s_mount_opt, DAX);
> -		} else if (blocksize != PAGE_SIZE) {
> -			ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
> -			clear_opt(sbi->s_mount_opt, DAX);
> -		}
> -	}
> +
> +

Stray empty lines.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 15/17] configfs: replace __get_free_pages() with kzalloc()
From: Jan Kara @ 2026-05-25 16:22 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-15-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:27, Mike Rapoport (Microsoft) wrote:
> configfs allocates staging buffers with __get_free_pages().
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_pages() with kzalloc().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/configfs/file.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/configfs/file.c b/fs/configfs/file.c
> index ef8c3cd10cc6..a48cece775a3 100644
> --- a/fs/configfs/file.c
> +++ b/fs/configfs/file.c
> @@ -59,7 +59,7 @@ static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
>  	ssize_t count = -ENOENT;
>  
>  	if (!buffer->page)
> -		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
> +		buffer->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!buffer->page)
>  		return -ENOMEM;
>  
> @@ -184,7 +184,7 @@ static int fill_write_buffer(struct configfs_buffer *buffer,
>  	int copied;
>  
>  	if (!buffer->page)
> -		buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
> +		buffer->page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!buffer->page)
>  		return -ENOMEM;
>  
> @@ -381,8 +381,7 @@ static int configfs_release(struct inode *inode, struct file *filp)
>  	struct configfs_buffer *buffer = filp->private_data;
>  
>  	module_put(buffer->owner);
> -	if (buffer->page)
> -		free_page((unsigned long)buffer->page);
> +	kfree(buffer->page);
>  	mutex_destroy(&buffer->mutex);
>  	kfree(buffer);
>  	return 0;
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 14/17] fs/namespace: use __getname() to allocate mntpath buffer
From: Jan Kara @ 2026-05-25 16:22 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-14-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:26, Mike Rapoport (Microsoft) wrote:
> mnt_warn_timestamp_expiry() allocates memory for a path with
> __get_free_page() although there is a dedicated helper for allocation of
> file paths: __getname().
> 
> Replace __get_free_page() for allocation of a path buffer with __getname().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  fs/namespace.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index fe919abd2f01..2ed9cd846a81 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3303,7 +3303,7 @@ static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
>  	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
>  		char *buf, *mntpath;
>  
> -		buf = (char *)__get_free_page(GFP_KERNEL);
> +		buf = __getname();

Fair but d_path() below should then get PATH_MAX and not PAGE_SIZE.

>  		if (buf)
>  			mntpath = d_path(mountpoint, buf, PAGE_SIZE);
>  		else
> @@ -3319,7 +3319,7 @@ static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
>  
>  		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
>  		if (buf)
> -			free_page((unsigned long)buf);
> +			__putname(buf);

And __putname() is fine with NULL so no need for the if (buf) check here.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 13/17] fs/select: replace __get_free_page() with kmalloc()
From: Jan Kara @ 2026-05-25 16:19 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-13-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:25, Mike Rapoport (Microsoft) wrote:
> poll_get_entry() allocates new memory for poll_table entries using
> __get_free_page().
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_page() with kmalloc().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/select.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/select.c b/fs/select.c
> index 75978b18f48f..6fa63e48cdee 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -150,7 +150,7 @@ void poll_freewait(struct poll_wqueues *pwq)
>  		} while (entry > p->entries);
>  		old = p;
>  		p = p->next;
> -		free_page((unsigned long) old);
> +		kfree(old);
>  	}
>  }
>  EXPORT_SYMBOL(poll_freewait);
> @@ -165,7 +165,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
>  	if (!table || POLL_TABLE_FULL(table)) {
>  		struct poll_table_page *new_table;
>  
> -		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
> +		new_table = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  		if (!new_table) {
>  			p->error = -ENOMEM;
>  			return NULL;
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 11/17] isofs: replace __get_free_page() with kmalloc()
From: Jan Kara @ 2026-05-25 16:17 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-11-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:23, Mike Rapoport (Microsoft) wrote:
> isofs_readdir() allocates a temporary buffer with __get_free_page().
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_page() with kmalloc().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Thanks. Added to my tree.

								Honza

> ---
>  fs/isofs/dir.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
> index 2fd9948d606e..6d220eab531e 100644
> --- a/fs/isofs/dir.c
> +++ b/fs/isofs/dir.c
> @@ -13,6 +13,7 @@
>   */
>  #include <linux/gfp.h>
>  #include <linux/filelock.h>
> +#include <linux/slab.h>
>  #include "isofs.h"
>  
>  int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
> @@ -255,7 +256,7 @@ static int isofs_readdir(struct file *file, struct dir_context *ctx)
>  	struct iso_directory_record *tmpde;
>  	struct inode *inode = file_inode(file);
>  
> -	tmpname = (char *)__get_free_page(GFP_KERNEL);
> +	tmpname = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (tmpname == NULL)
>  		return -ENOMEM;
>  
> @@ -263,7 +264,7 @@ static int isofs_readdir(struct file *file, struct dir_context *ctx)
>  
>  	result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
>  
> -	free_page((unsigned long) tmpname);
> +	kfree(tmpname);
>  	return result;
>  }
>  
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 10/17] jbd2: replace __get_free_pages() with kmalloc()
From: Jan Kara @ 2026-05-25 16:17 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-10-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:22, Mike Rapoport (Microsoft) wrote:
> jbd2_alloc() falls back from kmem_cache_alloc() to __get_free_pages() for
> allocations of PAGE_SIZE or larger.
> But kmalloc() can handle such cases with essentially the same fallback.
> 
> Replace use of __get_free_pages() with kmalloc() and simplify
> jbd2_free() as both kmem_cache_alloc() and kmalloc() allocations can be
> freed with kfree().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

I'll just note that the buffer we allocate here is fs-block-sized, so the same
kind of allocator as we use for folios would be even better. But that's a
different cleanup I guess.

								Honza

> ---
>  fs/jbd2/journal.c | 7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 4f397fcdb13c..1137b471e490 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -2784,7 +2784,7 @@ void *jbd2_alloc(size_t size, gfp_t flags)
>  	if (size < PAGE_SIZE)
>  		ptr = kmem_cache_alloc(get_slab(size), flags);
>  	else
> -		ptr = (void *)__get_free_pages(flags, get_order(size));
> +		ptr = kmalloc(size, flags);
>  
>  	/* Check alignment; SLUB has gotten this wrong in the past,
>  	 * and this can lead to user data corruption! */
> @@ -2795,10 +2795,7 @@ void *jbd2_alloc(size_t size, gfp_t flags)
>  
>  void jbd2_free(void *ptr, size_t size)
>  {
> -	if (size < PAGE_SIZE)
> -		kmem_cache_free(get_slab(size), ptr);
> -	else
> -		free_pages((unsigned long)ptr, get_order(size));
> +	kfree(ptr);
>  };
>  
>  /*
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 03/17] ocfs2/dlm: replace __get_free_page() with kmalloc()
From: Jan Kara @ 2026-05-25 16:13 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-3-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:15, Mike Rapoport (Microsoft) wrote:
> A few places in ocfs2 allocate temporary buffers with __get_free_page() or
> get_zeroed_page().
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_page() and get_zeroed_page() with kmalloc() and
> kzalloc() respectively.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ocfs2/dlm/dlmdebug.c    | 24 +++++++++---------------
>  fs/ocfs2/dlm/dlmdomain.c   |  8 +++++---
>  fs/ocfs2/dlm/dlmmaster.c   |  5 ++---
>  fs/ocfs2/dlm/dlmrecovery.c |  4 ++--
>  4 files changed, 18 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
> index fe4fdd09bae3..6ca8b3b68eef 100644
> --- a/fs/ocfs2/dlm/dlmdebug.c
> +++ b/fs/ocfs2/dlm/dlmdebug.c
> @@ -260,10 +260,10 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
>  {
>  	char *buf;
>  
> -	buf = (char *) get_zeroed_page(GFP_ATOMIC);
> +	buf = kzalloc(PAGE_SIZE, GFP_ATOMIC);
>  	if (buf) {
>  		dump_mle(mle, buf, PAGE_SIZE - 1);
> -		free_page((unsigned long)buf);
> +		kfree(buf);
>  	}
>  }
>  
> @@ -280,7 +280,7 @@ static struct dentry *dlm_debugfs_root;
>  /* begin - utils funcs */
>  static int debug_release(struct inode *inode, struct file *file)
>  {
> -	free_page((unsigned long)file->private_data);
> +	kfree(file->private_data);
>  	return 0;
>  }
>  
> @@ -327,17 +327,15 @@ static int debug_purgelist_open(struct inode *inode, struct file *file)
>  	struct dlm_ctxt *dlm = inode->i_private;
>  	char *buf = NULL;
>  
> -	buf = (char *) get_zeroed_page(GFP_NOFS);
> +	buf = kzalloc(PAGE_SIZE, GFP_NOFS);
>  	if (!buf)
> -		goto bail;
> +		return -ENOMEM;
>  
>  	i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
>  
>  	file->private_data = buf;
>  
>  	return 0;
> -bail:
> -	return -ENOMEM;
>  }
>  
>  static const struct file_operations debug_purgelist_fops = {
> @@ -384,17 +382,15 @@ static int debug_mle_open(struct inode *inode, struct file *file)
>  	struct dlm_ctxt *dlm = inode->i_private;
>  	char *buf = NULL;
>  
> -	buf = (char *) get_zeroed_page(GFP_NOFS);
> +	buf = kzalloc(PAGE_SIZE, GFP_NOFS);
>  	if (!buf)
> -		goto bail;
> +		return -ENOMEM;
>  
>  	i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
>  
>  	file->private_data = buf;
>  
>  	return 0;
> -bail:
> -	return -ENOMEM;
>  }
>  
>  static const struct file_operations debug_mle_fops = {
> @@ -775,17 +771,15 @@ static int debug_state_open(struct inode *inode, struct file *file)
>  	struct dlm_ctxt *dlm = inode->i_private;
>  	char *buf = NULL;
>  
> -	buf = (char *) get_zeroed_page(GFP_NOFS);
> +	buf = kzalloc(PAGE_SIZE, GFP_NOFS);
>  	if (!buf)
> -		goto bail;
> +		return -ENOMEM;
>  
>  	i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
>  
>  	file->private_data = buf;
>  
>  	return 0;
> -bail:
> -	return -ENOMEM;
>  }
>  
>  static const struct file_operations debug_state_fops = {
> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
> index dc9da9133c8e..97bb9400e24b 100644
> --- a/fs/ocfs2/dlm/dlmdomain.c
> +++ b/fs/ocfs2/dlm/dlmdomain.c
> @@ -63,7 +63,7 @@ static inline void byte_copymap(u8 dmap[], unsigned long smap[],
>  static void dlm_free_pagevec(void **vec, int pages)
>  {
>  	while (pages--)
> -		free_page((unsigned long)vec[pages]);
> +		kfree(vec[pages]);
>  	kfree(vec);
>  }
>  
> @@ -75,9 +75,11 @@ static void **dlm_alloc_pagevec(int pages)
>  	if (!vec)
>  		return NULL;
>  
> -	for (i = 0; i < pages; i++)
> -		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
> +	for (i = 0; i < pages; i++) {
> +		vec[i] = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +		if (!vec[i])
>  			goto out_free;
> +	}
>  
>  	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
>  	     pages, (unsigned long)DLM_HASH_PAGES,
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index 93eff38fdadd..aee3b4c56dcc 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -2548,7 +2548,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
>  
>  	/* preallocate up front. if this fails, abort */
>  	ret = -ENOMEM;
> -	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
> +	mres = kmalloc(PAGE_SIZE, GFP_NOFS);
>  	if (!mres) {
>  		mlog_errno(ret);
>  		goto leave;
> @@ -2725,8 +2725,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
>  	if (wake)
>  		wake_up(&res->wq);
>  
> -	if (mres)
> -		free_page((unsigned long)mres);
> +	kfree(mres);
>  
>  	dlm_put(dlm);
>  
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 128872bd945d..9b97bf73df22 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -837,7 +837,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
>  	}
>  
>  	/* this will get freed by dlm_request_all_locks_worker */
> -	buf = (char *) __get_free_page(GFP_NOFS);
> +	buf = kmalloc(PAGE_SIZE, GFP_NOFS);
>  	if (!buf) {
>  		kfree(item);
>  		dlm_put(dlm);
> @@ -933,7 +933,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
>  		}
>  	}
>  leave:
> -	free_page((unsigned long)data);
> +	kfree(data);
>  }
>  
>  
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 02/17] proc: replace __get_free_page() with kmalloc()
From: Jan Kara @ 2026-05-25 16:11 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-2-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:14, Mike Rapoport (Microsoft) wrote:
> A few functions in fs/proc/base.c use __get_free_page() to allocate a
> temporary buffer.
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_page() with kmalloc().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/proc/base.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index d9acfa89c894..e129dc509b79 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -261,7 +261,7 @@ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
>  	if (pos >= PAGE_SIZE)
>  		return 0;
>  
> -	page = (char *)__get_free_page(GFP_KERNEL);
> +	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -284,7 +284,7 @@ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
>  			ret = len;
>  		}
>  	}
> -	free_page((unsigned long)page);
> +	kfree(page);
>  	return ret;
>  }
>  
> @@ -347,7 +347,7 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
>  	if (count > arg_end - pos)
>  		count = arg_end - pos;
>  
> -	page = (char *)__get_free_page(GFP_KERNEL);
> +	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -371,7 +371,7 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
>  		count -= got;
>  	}
>  
> -	free_page((unsigned long)page);
> +	kfree(page);
>  	return len;
>  }
>  
> @@ -908,7 +908,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
>  	if (!mm)
>  		return 0;
>  
> -	page = (char *)__get_free_page(GFP_KERNEL);
> +	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -949,7 +949,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
>  
>  	mmput(mm);
>  free:
> -	free_page((unsigned long) page);
> +	kfree(page);
>  	return copied;
>  }
>  
> @@ -1016,7 +1016,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
>  	if (!mm || !mm->env_end)
>  		return 0;
>  
> -	page = (char *)__get_free_page(GFP_KERNEL);
> +	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!page)
>  		return -ENOMEM;
>  
> @@ -1062,7 +1062,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
>  	mmput(mm);
>  
>  free:
> -	free_page((unsigned long) page);
> +	kfree(page);
>  	return ret;
>  }
>  
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 01/17] quota: allocate dquot_hash with kmalloc()
From: Jan Kara @ 2026-05-25 16:10 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian, linux-kernel, linux-fsdevel,
	ocfs2-devel, linux-nilfs, linux-nfs, jfs-discussion, linux-ext4,
	linux-mm
In-Reply-To: <20260523-b4-fs-v1-1-275e36a83f0e@kernel.org>

On Sat 23-05-26 20:54:13, Mike Rapoport (Microsoft) wrote:
> dquot_init() allocates a single page for dquot_hash with
> __get_free_pages().
> 
> kmalloc() is a better API for such use and it also provides better
> scalability and more debugging possibilities.
> 
> Replace use of __get_free_pages() with kmalloc() and get rid of the order
> variable that remained 0 for more than 20 years.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Thanks! I've added this patch to my tree.

								Honza

> ---
>  fs/quota/dquot.c | 11 +++++------
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 64cf42721496..9850de3955d3 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -3022,7 +3022,7 @@ static const struct ctl_table fs_dqstats_table[] = {
>  static int __init dquot_init(void)
>  {
>  	int i, ret;
> -	unsigned long nr_hash, order;
> +	unsigned long nr_hash;
>  	struct shrinker *dqcache_shrinker;
>  
>  	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
> @@ -3035,8 +3035,7 @@ static int __init dquot_init(void)
>  				SLAB_PANIC),
>  			NULL);
>  
> -	order = 0;
> -	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
> +	dquot_hash = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  	if (!dquot_hash)
>  		panic("Cannot create dquot hash table");
>  
> @@ -3046,7 +3045,7 @@ static int __init dquot_init(void)
>  		panic("Cannot create dquot stat counters");
>  
>  	/* Find power-of-two hlist_heads which can fit into allocation */
> -	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
> +	nr_hash = PAGE_SIZE / sizeof(struct hlist_head);
>  	dq_hash_bits = ilog2(nr_hash);
>  
>  	nr_hash = 1UL << dq_hash_bits;
> @@ -3054,8 +3053,8 @@ static int __init dquot_init(void)
>  	for (i = 0; i < nr_hash; i++)
>  		INIT_HLIST_HEAD(dquot_hash + i);
>  
> -	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
> -		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
> +	pr_info("VFS: Dquot-cache hash table entries: %ld (%ld bytes)\n",
> +		nr_hash, PAGE_SIZE);
>  
>  	dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
>  	if (!dqcache_shrinker)
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Jan Kara @ 2026-05-25 15:42 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260522030540.3896201-1-yi.zhang@huaweicloud.com>

On Fri 22-05-26 11:05:40, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> jbd2_journal_try_to_free_buffers() currently only tries to remove
> checkpointed data buffers from the checkpoint list for data=journal
> mode, and bails out if any buffer is still attached to a transaction.
> For data=ordered and writeback modes, data buffers never have
> journal_heads, so the function degenerates to a plain
> try_to_free_buffers() call.
> 
> Besides, the release of metadata buffers has been delegated to the jbd2
> journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
> release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
> not used for handling metadata buffers now.
> 
> However, the comment above the function still references
> jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
> BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
> ("jbd2: Remove data=ordered mode support using jbd buffer heads").
> 
> Replace it with a description of what the function actually does now.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Thanks for the update. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/jbd2/transaction.c | 39 ++++++++++++---------------------------
>  1 file changed, 12 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..239bcf88ed1c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
>  }
>  
>  /**
> - * jbd2_journal_try_to_free_buffers() - try to free page buffers.
> + * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
>   * @journal: journal for operation
>   * @folio: Folio to detach data from.
>   *
> - * For all the buffers on this page,
> - * if they are fully written out ordered data, move them onto BUF_CLEAN
> - * so try_to_free_buffers() can reap them.
> + * For each buffer_head on @folio, if the buffer has a journal_head but
> + * is not attached to a running or committing transaction, try to remove
> + * it from the checkpoint list.  This is needed for data=journal mode
> + * where data buffers are journaled: once they are checkpointed, the
> + * journal_head can be detached and the buffer freed.  If any buffer is
> + * still attached to a transaction, the folio cannot be released and we
> + * bail out.  Otherwise we call try_to_free_buffers() to detach all
> + * buffer_heads from the folio.
>   *
> - * This function returns non-zero if we wish try_to_free_buffers()
> - * to be called. We do this if the page is releasable by try_to_free_buffers().
> - * We also do it if the page has locked or dirty buffers and the caller wants
> - * us to perform sync or async writeout.
> + * For data=ordered and writeback modes, data buffers never have
> + * journal_heads, so this degenerates to a plain try_to_free_buffers().
>   *
> - * This complicates JBD locking somewhat.  We aren't protected by the
> - * BKL here.  We wish to remove the buffer from its committing or
> - * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
> - *
> - * This may *change* the value of transaction_t->t_datalist, so anyone
> - * who looks at t_datalist needs to lock against this function.
> - *
> - * Even worse, someone may be doing a jbd2_journal_dirty_data on this
> - * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
> - * will come out of the lock with the buffer dirty, which makes it
> - * ineligible for release here.
> - *
> - * Who else is affected by this?  hmm...  Really the only contender
> - * is do_get_write_access() - it could be looking at the buffer while
> - * journal_try_to_free_buffer() is changing its state.  But that
> - * cannot happen because we never reallocate freed data as metadata
> - * while the data is part of a transaction.  Yes?
> - *
> - * Return false on failure, true on success
> + * Return: true if the folio's buffers were freed, false otherwise
>   */
>  bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
>  {
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 2/2] ext4: get ext4_group_desc in ext4_mb_prefetch only when necessary
From: Jan Kara @ 2026-05-25 15:32 UTC (permalink / raw)
  To: Bohdan Trach
  Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, mchehab+huawei,
	lilith.oberhauser, linux-ext4, linux-kernel
In-Reply-To: <20260521125931.16474-3-bohdan.trach@huaweicloud.com>

On Thu 21-05-26 14:59:29, Bohdan Trach wrote:
> Getting ext4_group_desc structure can contribute to the cost of
> ext4_mb_prefetch() without any need, as most groups fail the
> !EXT4_MB_GRP_TEST_AND_SET_READ check.
> 
> Optimize ext4_mb_prefetch by getting the group description only when
> necessary.
> 
> The result is a further increase in performance of the fallocate() system
> call path that triggers ext4_mb_prefetch() via a linear group scan.
> 
> Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>

Looks good. Just one nit below:

> @@ -2872,14 +2870,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
>  		 * prefetch once, so we avoid getblk() call, which can
>  		 * be expensive.
>  		 */
> -		if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
> -		    EXT4_MB_GRP_NEED_INIT(grp) &&
> -		    ext4_free_group_clusters(sb, gdp) > 0 ) {
> -			bh = ext4_read_block_bitmap_nowait(sb, group, true);
> -			if (!IS_ERR_OR_NULL(bh)) {
> -				if (!buffer_uptodate(bh) && cnt)
> -					(*cnt)++;
> -				brelse(bh);
> +		if (group < ngroups && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&

The 'group < ngroups' check should always be true and is also implicitly
covered by the 'grp != NULL' check, so I'd remove that bit. Otherwise feel
free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

									Honza

> +		    EXT4_MB_GRP_NEED_INIT(grp)) {
> +			struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
> +
> +			if (gdp && ext4_free_group_clusters(sb, gdp) > 0) {
> +				bh = ext4_read_block_bitmap_nowait(sb, group, true);
> +				if (!IS_ERR_OR_NULL(bh)) {
> +					if (!buffer_uptodate(bh) && cnt)
> +						(*cnt)++;
> +					brelse(bh);
> +				}
>  			}
>  		}
>  		if (++group >= ngroups)
> -- 
> 2.43.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 1/2] ext4: avoid RMW atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Jan Kara @ 2026-05-25 15:28 UTC (permalink / raw)
  To: Bohdan Trach
  Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, mchehab+huawei,
	lilith.oberhauser, linux-ext4, linux-kernel
In-Reply-To: <20260521125931.16474-2-bohdan.trach@huaweicloud.com>

On Thu 21-05-26 14:59:28, Bohdan Trach wrote:
> EXT4_MB_GRP_TEST_AND_SET_READ uses test_and_set_bit function which
> issues an atomic write. This can cause high overhead due to cache
> contention when multiple threads iterate over groups in a tight loop,
> as is the case for ext4_mb_prefetch(). We have seen this to be a
> problem for Kunpeng 920b CPUs which use a single ARM LSE instruction
> for this purpose.
> 
> This change significantly reduces costs of fallocate() operations which
> trigger linear group scans on large multicore machines where
> test_and_set_bit issues an atomic write operation unconditionally.
> 
> Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
...
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56b82d4a15d7..0713207811a6 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3551,7 +3551,17 @@ struct ext4_group_info {
>  #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
>  	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
>  #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
> -	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
> +	(ext4_mb_grp_test_and_set_read((grp)))
> +
> +static inline int ext4_mb_grp_test_and_set_read(struct ext4_group_info *grp)
> +{
> +	int r = test_bit_acquire(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state);
> +
> +	if (!r)
> +		return test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state);
> +	else
> +		return r;
> +}

Good idea but do we really need the 'acquire' barrier here? I don't see
anything that would really need this so I think
EXT4_MB_GRP_TEST_AND_SET_READ() can be just:

test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state) || \
  test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state)

or am I missing something?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH] ext4: convert legacy ext4_debug() to standard pr_debug()
From: Jan Kara @ 2026-05-25 15:16 UTC (permalink / raw)
  To: lirongqing
  Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260521074634.2697-1-lirongqing@baidu.com>

On Thu 21-05-26 03:46:34, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
> 
> The ext4 file system historically implemented its own debug logging macro
> ext4_debug() via EXT4FS_DEBUG conditional compilation. This legacy
> implementation suffers from two major drawbacks:
> 
> 1. It makes two consecutive un-serialized printk() calls, which can
>    lead to severe log interleaving and corruption under multi-core
>    concurrent workloads.
> 2. It completely bypasses the standard modern kernel dynamic debug
>    (CONFIG_DYNAMIC_DEBUG) infrastructure.
> 
> Clean up the legacy implementation by leveraging pr_debug(). This squashes
> the multiple printk() calls into a single atomic execution, ensuring
> log integrity, while seamlessly hooking ext4 into the kernel's native
> dynamic debug framework.
> 
> The redundant __FILE__ and __LINE__ macros are intentionally removed from
> the string format because the dynamic debug infrastructure can already
> append them automatically at runtime (via the '+fl' flags) if desired.
> This avoids redundancy and double-logging in modern production/debugging
> environments while keeping the macro clean and robust against dangling
> comma compiler errors.
> 
> Signed-off-by: Li RongQing <lirongqing@baidu.com>

Nice! Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza


> ---
>  fs/ext4/ext4.h | 20 ++------------------
>  1 file changed, 2 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a9..39e86ff 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -62,24 +62,8 @@
>   */
>  #define DOUBLE_CHECK__
>  
> -/*
> - * Define EXT4FS_DEBUG to produce debug messages
> - */
> -#undef EXT4FS_DEBUG
> -
> -/*
> - * Debug code
> - */
> -#ifdef EXT4FS_DEBUG
> -#define ext4_debug(f, a...)						\
> -	do {								\
> -		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
> -			__FILE__, __LINE__, __func__);			\
> -		printk(KERN_DEBUG f, ## a);				\
> -	} while (0)
> -#else
> -#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
> -#endif
> +#define ext4_debug(fmt, ...)						\
> +	pr_debug("EXT4-fs DEBUG %s: " fmt, __func__,  ##__VA_ARGS__)
>  
>   /*
>    * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
> -- 
> 2.9.4
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox