[PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation

public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation
@ 2023-11-16  2:28 Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 02/17] xfs: don't leak xfs_buf_cancel structures when recovery fails Leah Rumancik
                   ` (15 more replies)
  0 siblings, 16 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Christoph Hellwig, Dave Chinner, Dave Chinner, Leah Rumancik,
	Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 2723234923b3294dbcf6019c288c87465e927ed4 ]

Move the code that allocates and frees the buffer cancellation tables
used by log recovery into the file that actually uses the tables.  This
is a precursor to some cleanups and a memory leak fix.

( backport: dependency of 8db074bd84df5ccc88bff3f8f900f66f4b8349fa )

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/libxfs/xfs_log_recover.h | 14 +++++-----
 fs/xfs/xfs_buf_item_recover.c   | 47 +++++++++++++++++++++++++++++++++
 fs/xfs/xfs_log_priv.h           |  3 ---
 fs/xfs/xfs_log_recover.c        | 32 +++++++---------------
 4 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index ff69a0000817..b8b65a6e9b1e 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -108,12 +108,6 @@ struct xlog_recover {
 
 #define ITEM_TYPE(i)	(*(unsigned short *)(i)->ri_buf[0].i_addr)
 
-/*
- * This is the number of entries in the l_buf_cancel_table used during
- * recovery.
- */
-#define	XLOG_BC_TABLE_SIZE	64
-
 #define	XLOG_RECOVER_CRCPASS	0
 #define	XLOG_RECOVER_PASS1	1
 #define	XLOG_RECOVER_PASS2	2
@@ -126,5 +120,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
 		struct xfs_inode **ipp);
 void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
 		uint64_t intent_id);
+void xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
 
 #endif	/* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index e04e44ef14c6..dc099b2f4984 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -23,6 +23,15 @@
 #include "xfs_dir2.h"
 #include "xfs_quota.h"
 
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define	XLOG_BC_TABLE_SIZE	64
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+	((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
 /*
  * This structure is used during recovery to record the buf log items which
  * have been canceled and should not be replayed.
@@ -1003,3 +1012,41 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = {
 	.commit_pass1		= xlog_recover_buf_commit_pass1,
 	.commit_pass2		= xlog_recover_buf_commit_pass2,
 };
+
+#ifdef DEBUG
+void
+xlog_check_buf_cancel_table(
+	struct xlog	*log)
+{
+	int		i;
+
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+		ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+}
+#endif
+
+void
+xlog_alloc_buf_cancel_table(
+	struct xlog	*log)
+{
+	int		i;
+
+	ASSERT(log->l_buf_cancel_table == NULL);
+
+	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+						 sizeof(struct list_head),
+						 0);
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+}
+
+void
+xlog_free_buf_cancel_table(
+	struct xlog	*log)
+{
+	if (!log->l_buf_cancel_table)
+		return;
+
+	kmem_free(log->l_buf_cancel_table);
+	log->l_buf_cancel_table = NULL;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d68ca39f45..03393595676f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -454,9 +454,6 @@ struct xlog {
 	struct rw_semaphore	l_incompat_users;
 };
 
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
-	((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-
 /*
  * Bits for operational state
  */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 581aeb288b32..18d8eebc2d44 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3248,7 +3248,7 @@ xlog_do_log_recovery(
 	xfs_daddr_t	head_blk,
 	xfs_daddr_t	tail_blk)
 {
-	int		error, i;
+	int		error;
 
 	ASSERT(head_blk != tail_blk);
 
@@ -3256,37 +3256,23 @@ xlog_do_log_recovery(
 	 * First do a pass to find all of the cancelled buf log items.
 	 * Store them in the buf_cancel_table for use in the second pass.
 	 */
-	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
-						 sizeof(struct list_head),
-						 0);
-	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+	xlog_alloc_buf_cancel_table(log);
 
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS1, NULL);
-	if (error != 0) {
-		kmem_free(log->l_buf_cancel_table);
-		log->l_buf_cancel_table = NULL;
-		return error;
-	}
+	if (error != 0)
+		goto out_cancel;
+
 	/*
 	 * Then do a second pass to actually recover the items in the log.
 	 * When it is complete free the table of buf cancel items.
 	 */
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS2, NULL);
-#ifdef DEBUG
-	if (!error) {
-		int	i;
-
-		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
-	}
-#endif	/* DEBUG */
-
-	kmem_free(log->l_buf_cancel_table);
-	log->l_buf_cancel_table = NULL;
-
+	if (!error)
+		xlog_check_buf_cancel_table(log);
+out_cancel:
+	xlog_free_buf_cancel_table(log);
 	return error;
 }
 
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 02/17] xfs: don't leak xfs_buf_cancel structures when recovery fails
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 03/17] xfs: convert buf_cancel_table allocation to kmalloc_array Leah Rumancik
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Christoph Hellwig, Dave Chinner, Dave Chinner, Leah Rumancik,
	Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 8db074bd84df5ccc88bff3f8f900f66f4b8349fa ]

If log recovery fails, we free the memory used by the buffer
cancellation buckets, but we don't actually traverse each bucket list to
free the individual xfs_buf_cancel objects.  This leads to a memory
leak, as reported by kmemleak in xfs/051:

unreferenced object 0xffff888103629560 (size 32):
  comm "mount", pid 687045, jiffies 4296935916 (age 10.752s)
  hex dump (first 32 bytes):
    08 d3 0a 01 00 00 00 00 08 00 00 00 01 00 00 00  ................
    d0 f5 0b 92 81 88 ff ff 80 64 64 25 81 88 ff ff  .........dd%....
  backtrace:
    [<ffffffffa0317c83>] kmem_alloc+0x73/0x140 [xfs]
    [<ffffffffa03234a9>] xlog_recover_buf_commit_pass1+0x139/0x200 [xfs]
    [<ffffffffa032dc27>] xlog_recover_commit_trans+0x307/0x350 [xfs]
    [<ffffffffa032df15>] xlog_recovery_process_trans+0xa5/0xe0 [xfs]
    [<ffffffffa032e12d>] xlog_recover_process_data+0x8d/0x140 [xfs]
    [<ffffffffa032e49d>] xlog_do_recovery_pass+0x19d/0x740 [xfs]
    [<ffffffffa032f22d>] xlog_do_log_recovery+0x6d/0x150 [xfs]
    [<ffffffffa032f343>] xlog_do_recover+0x33/0x1d0 [xfs]
    [<ffffffffa032faba>] xlog_recover+0xda/0x190 [xfs]
    [<ffffffffa03194bc>] xfs_log_mount+0x14c/0x360 [xfs]
    [<ffffffffa030bfed>] xfs_mountfs+0x50d/0xa60 [xfs]
    [<ffffffffa03124b5>] xfs_fs_fill_super+0x6a5/0x950 [xfs]
    [<ffffffff812b92a5>] get_tree_bdev+0x175/0x280
    [<ffffffff812b7c3a>] vfs_get_tree+0x1a/0x80
    [<ffffffff812e366f>] path_mount+0x6ff/0xaa0
    [<ffffffff812e3b13>] __x64_sys_mount+0x103/0x140

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_buf_item_recover.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index dc099b2f4984..635f7f8ed9c2 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -1044,9 +1044,22 @@ void
 xlog_free_buf_cancel_table(
 	struct xlog	*log)
 {
+	int		i;
+
 	if (!log->l_buf_cancel_table)
 		return;
 
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+		struct xfs_buf_cancel	*bc;
+
+		while ((bc = list_first_entry_or_null(
+				&log->l_buf_cancel_table[i],
+				struct xfs_buf_cancel, bc_list))) {
+			list_del(&bc->bc_list);
+			kmem_free(bc);
+		}
+	}
+
 	kmem_free(log->l_buf_cancel_table);
 	log->l_buf_cancel_table = NULL;
 }
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 03/17] xfs: convert buf_cancel_table allocation to kmalloc_array
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 02/17] xfs: don't leak xfs_buf_cancel structures when recovery fails Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 04/17] xfs: use invalidate_lock to check the state of mmap_lock Leah Rumancik
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Christoph Hellwig, Dave Chinner, Dave Chinner, Leah Rumancik,
	Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 910bbdf2f4d7df46781bc9b723048f5ebed3d0d7 ]

While we're messing around with how recovery allocates and frees the
buffer cancellation table, convert the allocation to use kmalloc_array
instead of the old kmem_alloc APIs, and make it handle a null return,
even though that's not likely.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/libxfs/xfs_log_recover.h |  2 +-
 fs/xfs/xfs_buf_item_recover.c   | 14 ++++++++++----
 fs/xfs/xfs_log_recover.c        |  4 +++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index b8b65a6e9b1e..81a065b0b571 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -120,7 +120,7 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
 		struct xfs_inode **ipp);
 void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
 		uint64_t intent_id);
-void xlog_alloc_buf_cancel_table(struct xlog *log);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
 void xlog_free_buf_cancel_table(struct xlog *log);
 
 #ifdef DEBUG
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 635f7f8ed9c2..31cbe7deebfa 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -1025,19 +1025,25 @@ xlog_check_buf_cancel_table(
 }
 #endif
 
-void
+int
 xlog_alloc_buf_cancel_table(
 	struct xlog	*log)
 {
+	void		*p;
 	int		i;
 
 	ASSERT(log->l_buf_cancel_table == NULL);
 
-	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
-						 sizeof(struct list_head),
-						 0);
+	p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
+			  GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	log->l_buf_cancel_table = p;
 	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+	return 0;
 }
 
 void
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 18d8eebc2d44..aeb01d4c0423 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3256,7 +3256,9 @@ xlog_do_log_recovery(
 	 * First do a pass to find all of the cancelled buf log items.
 	 * Store them in the buf_cancel_table for use in the second pass.
 	 */
-	xlog_alloc_buf_cancel_table(log);
+	error = xlog_alloc_buf_cancel_table(log);
+	if (error)
+		return error;
 
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS1, NULL);
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 04/17] xfs: use invalidate_lock to check the state of mmap_lock
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 02/17] xfs: don't leak xfs_buf_cancel structures when recovery fails Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 03/17] xfs: convert buf_cancel_table allocation to kmalloc_array Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 05/17] xfs: prevent a UAF when log IO errors race with unmount Leah Rumancik
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Kaixu Xia, Dave Chinner,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Kaixu Xia <kaixuxia@tencent.com>

[ Upstream commit 82af88063961da9425924d9aec3fb67a4ebade3e ]

We should use invalidate_lock and XFS_MMAPLOCK_SHARED to check the state
of mmap_lock rw_semaphore in xfs_isilocked(), rather than i_rwsem and
XFS_IOLOCK_SHARED.

Fixes: 2433480a7e1d ("xfs: Convert to use invalidate_lock")
Signed-off-by: Kaixu Xia <kaixuxia@tencent.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b2ea85318214..df64b902842d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -378,8 +378,8 @@ xfs_isilocked(
 	}
 
 	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
-		return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
-				(lock_flags & XFS_IOLOCK_SHARED));
+		return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+				(lock_flags & XFS_MMAPLOCK_SHARED));
 	}
 
 	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 05/17] xfs: prevent a UAF when log IO errors race with unmount
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (2 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 04/17] xfs: use invalidate_lock to check the state of mmap_lock Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 06/17] xfs: flush inode gc workqueue before clearing agi bucket Leah Rumancik
                   ` (11 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Dave Chinner, Leah Rumancik, Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 7561cea5dbb97fecb952548a0fb74fb105bf4664 ]

KASAN reported the following use after free bug when running
generic/475:

 XFS (dm-0): Mounting V5 Filesystem
 XFS (dm-0): Starting recovery (logdev: internal)
 XFS (dm-0): Ending recovery (logdev: internal)
 Buffer I/O error on dev dm-0, logical block 20639616, async page read
 Buffer I/O error on dev dm-0, logical block 20639617, async page read
 XFS (dm-0): log I/O error -5
 XFS (dm-0): Filesystem has been shut down due to log error (0x2).
 XFS (dm-0): Unmounting Filesystem
 XFS (dm-0): Please unmount the filesystem and rectify the problem(s).
 ==================================================================
 BUG: KASAN: use-after-free in do_raw_spin_lock+0x246/0x270
 Read of size 4 at addr ffff888109dd84c4 by task 3:1H/136

 CPU: 3 PID: 136 Comm: 3:1H Not tainted 5.19.0-rc4-xfsx #rc4 8e53ab5ad0fddeb31cee5e7063ff9c361915a9c4
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
 Workqueue: xfs-log/dm-0 xlog_ioend_work [xfs]
 Call Trace:
  <TASK>
  dump_stack_lvl+0x34/0x44
  print_report.cold+0x2b8/0x661
  ? do_raw_spin_lock+0x246/0x270
  kasan_report+0xab/0x120
  ? do_raw_spin_lock+0x246/0x270
  do_raw_spin_lock+0x246/0x270
  ? rwlock_bug.part.0+0x90/0x90
  xlog_force_shutdown+0xf6/0x370 [xfs 4ad76ae0d6add7e8183a553e624c31e9ed567318]
  xlog_ioend_work+0x100/0x190 [xfs 4ad76ae0d6add7e8183a553e624c31e9ed567318]
  process_one_work+0x672/0x1040
  worker_thread+0x59b/0xec0
  ? __kthread_parkme+0xc6/0x1f0
  ? process_one_work+0x1040/0x1040
  ? process_one_work+0x1040/0x1040
  kthread+0x29e/0x340
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>

 Allocated by task 154099:
  kasan_save_stack+0x1e/0x40
  __kasan_kmalloc+0x81/0xa0
  kmem_alloc+0x8d/0x2e0 [xfs]
  xlog_cil_init+0x1f/0x540 [xfs]
  xlog_alloc_log+0xd1e/0x1260 [xfs]
  xfs_log_mount+0xba/0x640 [xfs]
  xfs_mountfs+0xf2b/0x1d00 [xfs]
  xfs_fs_fill_super+0x10af/0x1910 [xfs]
  get_tree_bdev+0x383/0x670
  vfs_get_tree+0x7d/0x240
  path_mount+0xdb7/0x1890
  __x64_sys_mount+0x1fa/0x270
  do_syscall_64+0x2b/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

 Freed by task 154151:
  kasan_save_stack+0x1e/0x40
  kasan_set_track+0x21/0x30
  kasan_set_free_info+0x20/0x30
  ____kasan_slab_free+0x110/0x190
  slab_free_freelist_hook+0xab/0x180
  kfree+0xbc/0x310
  xlog_dealloc_log+0x1b/0x2b0 [xfs]
  xfs_unmountfs+0x119/0x200 [xfs]
  xfs_fs_put_super+0x6e/0x2e0 [xfs]
  generic_shutdown_super+0x12b/0x3a0
  kill_block_super+0x95/0xd0
  deactivate_locked_super+0x80/0x130
  cleanup_mnt+0x329/0x4d0
  task_work_run+0xc5/0x160
  exit_to_user_mode_prepare+0xd4/0xe0
  syscall_exit_to_user_mode+0x1d/0x40
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

This appears to be a race between the unmount process, which frees the
CIL and waits for in-flight iclog IO; and the iclog IO completion.  When
generic/475 runs, it starts fsstress in the background, waits a few
seconds, and substitutes a dm-error device to simulate a disk falling
out of a machine.  If the fsstress encounters EIO on a pure data write,
it will exit but the filesystem will still be online.

The next thing the test does is unmount the filesystem, which tries to
clean the log, free the CIL, and wait for iclog IO completion.  If an
iclog was being written when the dm-error switch occurred, it can race
with log unmounting as follows:

Thread 1				Thread 2

					xfs_log_unmount
					xfs_log_clean
					xfs_log_quiesce
xlog_ioend_work
<observe error>
xlog_force_shutdown
test_and_set_bit(XLOG_IOERROR)
					xfs_log_force
					<log is shut down, nop>
					xfs_log_umount_write
					<log is shut down, nop>
					xlog_dealloc_log
					xlog_cil_destroy
					<wait for iclogs>
spin_lock(&log->l_cilp->xc_push_lock)
<KABOOM>

Therefore, free the CIL after waiting for the iclogs to complete.  I
/think/ this race has existed for quite a few years now, though I don't
remember the ~2014 era logging code well enough to know if it was a real
threat then or if the actual race was exposed only more recently.

Fixes: ac983517ec59 ("xfs: don't sleep in xlog_cil_force_lsn on shutdown")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_log.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0fb7d05ca308..eba295f666ac 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2061,8 +2061,6 @@ xlog_dealloc_log(
 	xlog_in_core_t	*iclog, *next_iclog;
 	int		i;
 
-	xlog_cil_destroy(log);
-
 	/*
 	 * Cycle all the iclogbuf locks to make sure all log IO completion
 	 * is done before we tear down these buffers.
@@ -2074,6 +2072,13 @@ xlog_dealloc_log(
 		iclog = iclog->ic_next;
 	}
 
+	/*
+	 * Destroy the CIL after waiting for iclog IO completion because an
+	 * iclog EIO error will try to shut down the log, which accesses the
+	 * CIL to wake up the waiters.
+	 */
+	xlog_cil_destroy(log);
+
 	iclog = log->l_iclog;
 	for (i = 0; i < log->l_iclog_bufs; i++) {
 		next_iclog = iclog->ic_next;
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 06/17] xfs: flush inode gc workqueue before clearing agi bucket
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (3 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 05/17] xfs: prevent a UAF when log IO errors race with unmount Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 07/17] xfs: fix use-after-free in xattr node block inactivation Leah Rumancik
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Zhang Yi, Dave Chinner,
	Darrick J . Wong, Dave Chinner, Leah Rumancik, Chandan Babu R

From: Zhang Yi <yi.zhang@huawei.com>

[ Upstream commit 04a98a036cf8b810dda172a9dcfcbd783bf63655 ]

In the procedure of recover AGI unlinked lists, if something bad
happenes on one of the unlinked inode in the bucket list, we would call
xlog_recover_clear_agi_bucket() to clear the whole unlinked bucket list,
not the unlinked inodes after the bad one. If we have already added some
inodes to the gc workqueue before the bad inode in the list, we could
get below error when freeing those inodes, and finaly fail to complete
the log recover procedure.

 XFS (ram0): Internal error xfs_iunlink_remove at line 2456 of file
 fs/xfs/xfs_inode.c.  Caller xfs_ifree+0xb0/0x360 [xfs]

The problem is xlog_recover_clear_agi_bucket() clear the bucket list, so
the gc worker fail to check the agino in xfs_verify_agino(). Fix this by
flush workqueue before clearing the bucket.

Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues")
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_log_recover.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aeb01d4c0423..04961ebf16ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2739,6 +2739,7 @@ xlog_recover_process_one_iunlink(
 	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
 	 * clear the inode pointer in the bucket.
 	 */
+	xfs_inodegc_flush(mp);
 	xlog_recover_clear_agi_bucket(mp, agno, bucket);
 	return NULLAGINO;
 }
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 07/17] xfs: fix use-after-free in xattr node block inactivation
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (4 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 06/17] xfs: flush inode gc workqueue before clearing agi bucket Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 08/17] xfs: don't leak memory when attr fork loading fails Leah Rumancik
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong, hch,
	kernel test robot, Dave Chinner, Leah Rumancik, Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 95ff0363f3f6ae70c21a0f2b0603e54438e5988b ]

The kernel build robot reported a UAF error while running xfs/433
(edited somewhat for brevity):

 BUG: KASAN: use-after-free in xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs
 Read of size 4 at addr ffff88820ac2bd44 by task kworker/0:2/139

 CPU: 0 PID: 139 Comm: kworker/0:2 Tainted: G S                5.19.0-rc2-00004-g7cf2b0f9611b #1
 Hardware name: Hewlett-Packard p6-1451cx/2ADA, BIOS 8.15 02/05/2013
 Workqueue: xfs-inodegc/sdb4 xfs_inodegc_worker [xfs]
 Call Trace:
  <TASK>
 dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
 print_address_description+0x1f/0x200
 print_report.cold (mm/kasan/report.c:430)
 kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493)
 xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs
 xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs
 xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs
 xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs
 xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs
 process_one_work
 worker_thread
 kthread
 ret_from_fork
  </TASK>

 Allocated by task 139:
 kasan_save_stack (mm/kasan/common.c:39)
 __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469)
 kmem_cache_alloc (mm/slab.h:750 mm/slub.c:3214 mm/slub.c:3222 mm/slub.c:3229 mm/slub.c:3239)
 _xfs_buf_alloc (include/linux/instrumented.h:86 include/linux/atomic/atomic-instrumented.h:41 fs/xfs/xfs_buf.c:232) xfs
 xfs_buf_get_map (fs/xfs/xfs_buf.c:660) xfs
 xfs_buf_read_map (fs/xfs/xfs_buf.c:777) xfs
 xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:289) xfs
 xfs_da_read_buf (fs/xfs/libxfs/xfs_da_btree.c:2652) xfs
 xfs_da3_node_read (fs/xfs/libxfs/xfs_da_btree.c:392) xfs
 xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:272) xfs
 xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs
 xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs
 xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs
 process_one_work
 worker_thread
 kthread
 ret_from_fork

 Freed by task 139:
 kasan_save_stack (mm/kasan/common.c:39)
 kasan_set_track (mm/kasan/common.c:45)
 kasan_set_free_info (mm/kasan/generic.c:372)
 __kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374)
 kmem_cache_free (mm/slub.c:1753 mm/slub.c:3507 mm/slub.c:3524)
 xfs_buf_rele (fs/xfs/xfs_buf.c:1040) xfs
 xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:210) xfs
 xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs
 xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs
 xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs
 xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs
 process_one_work
 worker_thread
 kthread
 ret_from_fork

I reproduced this for my own satisfaction, and got the same report,
along with an extra morsel:

 The buggy address belongs to the object at ffff88802103a800
  which belongs to the cache xfs_buf of size 432
 The buggy address is located 396 bytes inside of
  432-byte region [ffff88802103a800, ffff88802103a9b0)

I tracked this code down to:

	error = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
			child_blkno,
			XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0,
			&child_bp);
	if (error)
		return error;
	error = bp->b_error;

That doesn't look right -- I think this should be dereferencing
child_bp, not bp.  Looking through the codebase history, I think this
was added by commit 2911edb653b9 ("xfs: remove the mappedbno argument to
xfs_da_get_buf"), which replaced a call to xfs_da_get_buf with the
current call to xfs_trans_get_buf.  Not sure why we trans_brelse'd @bp
earlier in the function, but I'm guessing it's to avoid pinning too many
buffers in memory while we inactivate the bottom of the attr tree.
Hence we now have to get the buffer back.

I /think/ this was supposed to check child_bp->b_error and fail the rest
of the invalidation if child_bp had experienced any kind of IO or
corruption error.  I bet the xfs_da3_node_read earlier in the loop will
catch most cases of incoming on-disk corruption which makes this check
mostly moot unless someone corrupts the buffer and the AIL pushes it out
to disk while the buffer's unlocked.

In the first case we'll never get to the bad check, and in the second
case the AIL will shut down the log, at which point there's no reason to
check b_error.  Remove the check, and null out @bp to avoid this problem
in the future.

Cc: hch@lst.de
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: 2911edb653b9 ("xfs: remove the mappedbno argument to xfs_da_get_buf")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_attr_inactive.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2b5da6218977..2afa6d9a7f8f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -158,6 +158,7 @@ xfs_attr3_node_inactive(
 	}
 	child_fsb = be32_to_cpu(ichdr.btree[0].before);
 	xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
+	bp = NULL;
 
 	/*
 	 * If this is the node level just above the leaves, simply loop
@@ -211,12 +212,8 @@ xfs_attr3_node_inactive(
 				&child_bp);
 		if (error)
 			return error;
-		error = bp->b_error;
-		if (error) {
-			xfs_trans_brelse(*trans, child_bp);
-			return error;
-		}
 		xfs_trans_binval(*trans, child_bp);
+		child_bp = NULL;
 
 		/*
 		 * If we're not done, re-read the parent to get the next
@@ -233,6 +230,7 @@ xfs_attr3_node_inactive(
 						  bp->b_addr);
 			child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
 			xfs_trans_brelse(*trans, bp);
+			bp = NULL;
 		}
 		/*
 		 * Atomically commit the whole invalidate stuff.
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 08/17] xfs: don't leak memory when attr fork loading fails
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (5 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 07/17] xfs: fix use-after-free in xattr node block inactivation Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap() Leah Rumancik
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Dave Chinner, Leah Rumancik, Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit c78c2d0903183a41beb90c56a923e30f90fa91b9 ]

I observed the following evidence of a memory leak while running xfs/399
from the xfs fsck test suite (edited for brevity):

XFS (sde): Metadata corruption detected at xfs_attr_shortform_verify_struct.part.0+0x7b/0xb0 [xfs], inode 0x1172 attr fork
XFS: Assertion failed: ip->i_af.if_u1.if_data == NULL, file: fs/xfs/libxfs/xfs_inode_fork.c, line: 315
------------[ cut here ]------------
WARNING: CPU: 2 PID: 91635 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
CPU: 2 PID: 91635 Comm: xfs_scrub Tainted: G        W         5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
RIP: 0010:assfail+0x46/0x4a [xfs]
Call Trace:
 <TASK>
 xfs_ifork_zap_attr+0x7c/0xb0
 xfs_iformat_attr_fork+0x86/0x110
 xfs_inode_from_disk+0x41d/0x480
 xfs_iget+0x389/0xd70
 xfs_bulkstat_one_int+0x5b/0x540
 xfs_bulkstat_iwalk+0x1e/0x30
 xfs_iwalk_ag_recs+0xd1/0x160
 xfs_iwalk_run_callbacks+0xb9/0x180
 xfs_iwalk_ag+0x1d8/0x2e0
 xfs_iwalk+0x141/0x220
 xfs_bulkstat+0x105/0x180
 xfs_ioc_bulkstat.constprop.0.isra.0+0xc5/0x130
 xfs_file_ioctl+0xa5f/0xef0
 __x64_sys_ioctl+0x82/0xa0
 do_syscall_64+0x2b/0x80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

This newly-added assertion checks that there aren't any incore data
structures hanging off the incore fork when we're trying to reset its
contents.  From the call trace, it is evident that iget was trying to
construct an incore inode from the ondisk inode, but the attr fork
verifier failed and we were trying to undo all the memory allocations
that we had done earlier.

The three assertions in xfs_ifork_zap_attr check that the caller has
already called xfs_idestroy_fork, which clearly has not been done here.
As the zap function then zeroes the pointers, we've effectively leaked
the memory.

The shortest change would have been to insert an extra call to
xfs_idestroy_fork, but it makes more sense to bundle the _idestroy_fork
call into _zap_attr, since all other callsites call _idestroy_fork
immediately prior to calling _zap_attr.  IOWs, it eliminates one way to
fail.

Note: This change only applies cleanly to 2ed5b09b3e8f, since we just
reworked the attr fork lifetime.  However, I think this memory leak has
existed since 0f45a1b20cd8, since the chain xfs_iformat_attr_fork ->
xfs_iformat_local -> xfs_init_local_fork will allocate
ifp->if_u1.if_data, but if xfs_ifork_verify_local_attr fails,
xfs_iformat_attr_fork will free i_afp without freeing any of the stuff
hanging off i_afp.  The solution for older kernels I think is to add the
missing call to xfs_idestroy_fork just prior to calling kmem_cache_free.

Found by fuzzing a.sfattr.hdr.totsize = lastbit in xfs/399.

[ backport note: did not include refactoring of xfs_idestroy_fork into
xfs_ifork_zap_attr, simply added the missing call as suggested in the
commit for backports ]

Fixes: 2ed5b09b3e8f ("xfs: make inode attribute forks a permanent part of struct xfs_inode")
Probably-Fixes: 0f45a1b20cd8 ("xfs: improve local fork verification")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 20095233d7bc..c1f965af8432 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -330,6 +330,7 @@ xfs_iformat_attr_fork(
 	}

 	if (error) {
+		xfs_idestroy_fork(ip->i_afp);
 		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
 		ip->i_afp = NULL;
 	}
-- 
2.43.0.rc0.421.g78406f8d94-goog

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (6 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 08/17] xfs: don't leak memory when attr fork loading fails Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-20 15:38   ` Greg KH
  2023-11-16  2:28 ` [PATCH 5.15 10/17] xfs: fix intermittent hang during quotacheck Leah Rumancik
                   ` (7 subsequent siblings)
  15 siblings, 1 reply; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, ChenXiaoSong, Guo Xuenan,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: ChenXiaoSong <chenxiaosong2@huawei.com>

[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]

Reproducer:
 1. fallocate -l 100M image
 2. mkfs.xfs -f image
 3. mount image /mnt
 4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
 5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
                   "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
    fd = open("/mnt", O_RDONLY|O_DIRECTORY);
    ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);

NULL pointer dereference will occur when race happens between xfs_getbmap()
and xfs_bmap_set_attrforkoff():

         ioctl               |       setxattr
 ----------------------------|---------------------------
 xfs_getbmap                 |
   xfs_ifork_ptr             |
     xfs_inode_has_attr_fork |
       ip->i_forkoff == 0    |
     return NULL             |
   ifp == NULL               |
                             | xfs_bmap_set_attrforkoff
                             |   ip->i_forkoff > 0
   xfs_inode_has_attr_fork   |
     ip->i_forkoff > 0       |
   ifp == NULL               |
   ifp->if_format            |

Fix this by locking i_lock before xfs_ifork_ptr().

Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: added fixes tag]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fd2ad6a3019c..bea6cc26abf9 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -439,29 +439,28 @@ xfs_getbmap(
 		whichfork = XFS_COW_FORK;
 	else
 		whichfork = XFS_DATA_FORK;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	switch (whichfork) {
 	case XFS_ATTR_FORK:
+		lock = xfs_ilock_attr_map_shared(ip);
 		if (!XFS_IFORK_Q(ip))
-			goto out_unlock_iolock;
+			goto out_unlock_ilock;
 
 		max_len = 1LL << 32;
-		lock = xfs_ilock_attr_map_shared(ip);
 		break;
 	case XFS_COW_FORK:
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
+
 		/* No CoW fork? Just return */
-		if (!ifp)
-			goto out_unlock_iolock;
+		if (!XFS_IFORK_PTR(ip, whichfork))
+			goto out_unlock_ilock;
 
 		if (xfs_get_cowextsz_hint(ip))
 			max_len = mp->m_super->s_maxbytes;
 		else
 			max_len = XFS_ISIZE(ip);
-
-		lock = XFS_ILOCK_SHARED;
-		xfs_ilock(ip, lock);
 		break;
 	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
@@ -491,6 +490,8 @@ xfs_getbmap(
 		break;
 	}
 
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
 	switch (ifp->if_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 	case XFS_DINODE_FMT_BTREE:
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-16  2:28 ` [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap() Leah Rumancik
@ 2023-11-20 15:38   ` Greg KH
  2023-11-20 19:11     ` Darrick J. Wong
  2023-11-21  2:50     ` Sasha Levin
  0 siblings, 2 replies; 22+ messages in thread
From: Greg KH @ 2023-11-20 15:38 UTC (permalink / raw)
  To: Leah Rumancik
  Cc: stable, linux-xfs, amir73il, chandan.babu, fred, ChenXiaoSong,
	Guo Xuenan, Darrick J . Wong, Chandan Babu R

On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
> From: ChenXiaoSong <chenxiaosong2@huawei.com>
> 
> [ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
> 
> Reproducer:
>  1. fallocate -l 100M image
>  2. mkfs.xfs -f image
>  3. mount image /mnt
>  4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
>  5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
>                    "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
>     fd = open("/mnt", O_RDONLY|O_DIRECTORY);
>     ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
> 
> NULL pointer dereference will occur when race happens between xfs_getbmap()
> and xfs_bmap_set_attrforkoff():
> 
>          ioctl               |       setxattr
>  ----------------------------|---------------------------
>  xfs_getbmap                 |
>    xfs_ifork_ptr             |
>      xfs_inode_has_attr_fork |
>        ip->i_forkoff == 0    |
>      return NULL             |
>    ifp == NULL               |
>                              | xfs_bmap_set_attrforkoff
>                              |   ip->i_forkoff > 0
>    xfs_inode_has_attr_fork   |
>      ip->i_forkoff > 0       |
>    ifp == NULL               |
>    ifp->if_format            |
> 
> Fix this by locking i_lock before xfs_ifork_ptr().
> 
> Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
> Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
> Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> [djwong: added fixes tag]
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> Acked-by: Chandan Babu R <chandanbabu@kernel.org>
> ---
>  fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
>  1 file changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index fd2ad6a3019c..bea6cc26abf9 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -439,29 +439,28 @@ xfs_getbmap(
>  		whichfork = XFS_COW_FORK;
>  	else
>  		whichfork = XFS_DATA_FORK;
> -	ifp = XFS_IFORK_PTR(ip, whichfork);
>  
>  	xfs_ilock(ip, XFS_IOLOCK_SHARED);
>  	switch (whichfork) {
>  	case XFS_ATTR_FORK:
> +		lock = xfs_ilock_attr_map_shared(ip);
>  		if (!XFS_IFORK_Q(ip))
> -			goto out_unlock_iolock;
> +			goto out_unlock_ilock;
>  
>  		max_len = 1LL << 32;
> -		lock = xfs_ilock_attr_map_shared(ip);
>  		break;
>  	case XFS_COW_FORK:
> +		lock = XFS_ILOCK_SHARED;
> +		xfs_ilock(ip, lock);
> +
>  		/* No CoW fork? Just return */
> -		if (!ifp)
> -			goto out_unlock_iolock;
> +		if (!XFS_IFORK_PTR(ip, whichfork))
> +			goto out_unlock_ilock;
>  
>  		if (xfs_get_cowextsz_hint(ip))
>  			max_len = mp->m_super->s_maxbytes;
>  		else
>  			max_len = XFS_ISIZE(ip);
> -
> -		lock = XFS_ILOCK_SHARED;
> -		xfs_ilock(ip, lock);
>  		break;
>  	case XFS_DATA_FORK:
>  		if (!(iflags & BMV_IF_DELALLOC) &&
> @@ -491,6 +490,8 @@ xfs_getbmap(
>  		break;
>  	}
>  
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
> +
>  	switch (ifp->if_format) {
>  	case XFS_DINODE_FMT_EXTENTS:
>  	case XFS_DINODE_FMT_BTREE:
> -- 
> 2.43.0.rc0.421.g78406f8d94-goog
> 

This patch breaks the build, how was it tested?

fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’:
fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address]
  457 |                 if (!XFS_IFORK_PTR(ip, whichfork))
      |                     ^
In file included from fs/xfs/xfs_bmap_util.c:16:
fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here
   38 |         struct xfs_ifork        i_df;           /* data fork */
      |                                 ^~~~
cc1: all warnings being treated as errors



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-20 15:38   ` Greg KH
@ 2023-11-20 19:11     ` Darrick J. Wong
  2023-11-21  5:18       ` Greg KH
  2023-11-21  2:50     ` Sasha Levin
  1 sibling, 1 reply; 22+ messages in thread
From: Darrick J. Wong @ 2023-11-20 19:11 UTC (permalink / raw)
  To: Greg KH
  Cc: Leah Rumancik, stable, linux-xfs, amir73il, chandan.babu, fred,
	ChenXiaoSong, Guo Xuenan, Chandan Babu R

On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
> On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
> > From: ChenXiaoSong <chenxiaosong2@huawei.com>
> > 
> > [ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
> > 
> > Reproducer:
> >  1. fallocate -l 100M image
> >  2. mkfs.xfs -f image
> >  3. mount image /mnt
> >  4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
> >  5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
> >                    "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
> >     fd = open("/mnt", O_RDONLY|O_DIRECTORY);
> >     ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
> > 
> > NULL pointer dereference will occur when race happens between xfs_getbmap()
> > and xfs_bmap_set_attrforkoff():
> > 
> >          ioctl               |       setxattr
> >  ----------------------------|---------------------------
> >  xfs_getbmap                 |
> >    xfs_ifork_ptr             |
> >      xfs_inode_has_attr_fork |
> >        ip->i_forkoff == 0    |
> >      return NULL             |
> >    ifp == NULL               |
> >                              | xfs_bmap_set_attrforkoff
> >                              |   ip->i_forkoff > 0
> >    xfs_inode_has_attr_fork   |
> >      ip->i_forkoff > 0       |
> >    ifp == NULL               |
> >    ifp->if_format            |
> > 
> > Fix this by locking i_lock before xfs_ifork_ptr().
> > 
> > Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
> > Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
> > Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
> > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > [djwong: added fixes tag]
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> > Acked-by: Chandan Babu R <chandanbabu@kernel.org>
> > ---
> >  fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
> >  1 file changed, 9 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> > index fd2ad6a3019c..bea6cc26abf9 100644
> > --- a/fs/xfs/xfs_bmap_util.c
> > +++ b/fs/xfs/xfs_bmap_util.c
> > @@ -439,29 +439,28 @@ xfs_getbmap(
> >  		whichfork = XFS_COW_FORK;
> >  	else
> >  		whichfork = XFS_DATA_FORK;
> > -	ifp = XFS_IFORK_PTR(ip, whichfork);
> >  
> >  	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> >  	switch (whichfork) {
> >  	case XFS_ATTR_FORK:
> > +		lock = xfs_ilock_attr_map_shared(ip);
> >  		if (!XFS_IFORK_Q(ip))
> > -			goto out_unlock_iolock;
> > +			goto out_unlock_ilock;
> >  
> >  		max_len = 1LL << 32;
> > -		lock = xfs_ilock_attr_map_shared(ip);
> >  		break;
> >  	case XFS_COW_FORK:
> > +		lock = XFS_ILOCK_SHARED;
> > +		xfs_ilock(ip, lock);
> > +
> >  		/* No CoW fork? Just return */
> > -		if (!ifp)
> > -			goto out_unlock_iolock;
> > +		if (!XFS_IFORK_PTR(ip, whichfork))

Is this the line 457 that the compiler complains about below?

If so, then whichfork == XFS_COW_FORK here, because the code just
switch()d on that.  The XFS_IFORK_PTR macro decomposes into:

#define XFS_IFORK_PTR(ip,w)		\
	((w) == XFS_DATA_FORK ? \
		&(ip)->i_df : \
		((w) == XFS_ATTR_FORK ? \
			(ip)->i_afp : \
			(ip)->i_cowfp))

which means this test /should/ be turning into:

		if (!(ip->i_cowfp))
			goto out_unlock_ilock;

I'm not sure why your compiler is whining about &ip->i_df; that's not
what's being selected here for testing.  Unless your compiler is somehow
deciding that XFS_DATA_FORK == XFS_COW_FORK?  This should not ever be
possible.

--D

> > +			goto out_unlock_ilock;
> >  
> >  		if (xfs_get_cowextsz_hint(ip))
> >  			max_len = mp->m_super->s_maxbytes;
> >  		else
> >  			max_len = XFS_ISIZE(ip);
> > -
> > -		lock = XFS_ILOCK_SHARED;
> > -		xfs_ilock(ip, lock);
> >  		break;
> >  	case XFS_DATA_FORK:
> >  		if (!(iflags & BMV_IF_DELALLOC) &&
> > @@ -491,6 +490,8 @@ xfs_getbmap(
> >  		break;
> >  	}
> >  
> > +	ifp = XFS_IFORK_PTR(ip, whichfork);
> > +
> >  	switch (ifp->if_format) {
> >  	case XFS_DINODE_FMT_EXTENTS:
> >  	case XFS_DINODE_FMT_BTREE:
> > -- 
> > 2.43.0.rc0.421.g78406f8d94-goog
> > 
> 
> This patch breaks the build, how was it tested?
> 
> fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’:
> fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address]
>   457 |                 if (!XFS_IFORK_PTR(ip, whichfork))
>       |                     ^
> In file included from fs/xfs/xfs_bmap_util.c:16:
> fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here
>    38 |         struct xfs_ifork        i_df;           /* data fork */
>       |                                 ^~~~
> cc1: all warnings being treated as errors
> 
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-20 19:11     ` Darrick J. Wong
@ 2023-11-21  5:18       ` Greg KH
  0 siblings, 0 replies; 22+ messages in thread
From: Greg KH @ 2023-11-21  5:18 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Leah Rumancik, stable, linux-xfs, amir73il, chandan.babu, fred,
	ChenXiaoSong, Guo Xuenan, Chandan Babu R

On Mon, Nov 20, 2023 at 11:11:30AM -0800, Darrick J. Wong wrote:
> On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
> > On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
> > > From: ChenXiaoSong <chenxiaosong2@huawei.com>
> > > 
> > > [ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
> > > 
> > > Reproducer:
> > >  1. fallocate -l 100M image
> > >  2. mkfs.xfs -f image
> > >  3. mount image /mnt
> > >  4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
> > >  5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
> > >                    "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
> > >     fd = open("/mnt", O_RDONLY|O_DIRECTORY);
> > >     ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
> > > 
> > > NULL pointer dereference will occur when race happens between xfs_getbmap()
> > > and xfs_bmap_set_attrforkoff():
> > > 
> > >          ioctl               |       setxattr
> > >  ----------------------------|---------------------------
> > >  xfs_getbmap                 |
> > >    xfs_ifork_ptr             |
> > >      xfs_inode_has_attr_fork |
> > >        ip->i_forkoff == 0    |
> > >      return NULL             |
> > >    ifp == NULL               |
> > >                              | xfs_bmap_set_attrforkoff
> > >                              |   ip->i_forkoff > 0
> > >    xfs_inode_has_attr_fork   |
> > >      ip->i_forkoff > 0       |
> > >    ifp == NULL               |
> > >    ifp->if_format            |
> > > 
> > > Fix this by locking i_lock before xfs_ifork_ptr().
> > > 
> > > Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
> > > Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
> > > Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > [djwong: added fixes tag]
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> > > Acked-by: Chandan Babu R <chandanbabu@kernel.org>
> > > ---
> > >  fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
> > >  1 file changed, 9 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> > > index fd2ad6a3019c..bea6cc26abf9 100644
> > > --- a/fs/xfs/xfs_bmap_util.c
> > > +++ b/fs/xfs/xfs_bmap_util.c
> > > @@ -439,29 +439,28 @@ xfs_getbmap(
> > >  		whichfork = XFS_COW_FORK;
> > >  	else
> > >  		whichfork = XFS_DATA_FORK;
> > > -	ifp = XFS_IFORK_PTR(ip, whichfork);
> > >  
> > >  	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> > >  	switch (whichfork) {
> > >  	case XFS_ATTR_FORK:
> > > +		lock = xfs_ilock_attr_map_shared(ip);
> > >  		if (!XFS_IFORK_Q(ip))
> > > -			goto out_unlock_iolock;
> > > +			goto out_unlock_ilock;
> > >  
> > >  		max_len = 1LL << 32;
> > > -		lock = xfs_ilock_attr_map_shared(ip);
> > >  		break;
> > >  	case XFS_COW_FORK:
> > > +		lock = XFS_ILOCK_SHARED;
> > > +		xfs_ilock(ip, lock);
> > > +
> > >  		/* No CoW fork? Just return */
> > > -		if (!ifp)
> > > -			goto out_unlock_iolock;
> > > +		if (!XFS_IFORK_PTR(ip, whichfork))
> 
> Is this the line 457 that the compiler complains about below?
> 
> If so, then whichfork == XFS_COW_FORK here, because the code just
> switch()d on that.  The XFS_IFORK_PTR macro decomposes into:
> 
> #define XFS_IFORK_PTR(ip,w)		\
> 	((w) == XFS_DATA_FORK ? \
> 		&(ip)->i_df : \
> 		((w) == XFS_ATTR_FORK ? \
> 			(ip)->i_afp : \
> 			(ip)->i_cowfp))
> 
> which means this test /should/ be turning into:
> 
> 		if (!(ip->i_cowfp))
> 			goto out_unlock_ilock;
> 
> I'm not sure why your compiler is whining about &ip->i_df; that's not
> what's being selected here for testing.  Unless your compiler is somehow
> deciding that XFS_DATA_FORK == XFS_COW_FORK?  This should not ever be
> possible.

This is using gcc-12, and gcc-13, no idea what happened, just that it
throws up that warning which stops my builds :(

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-20 15:38   ` Greg KH
  2023-11-20 19:11     ` Darrick J. Wong
@ 2023-11-21  2:50     ` Sasha Levin
  2023-11-21  5:17       ` Greg KH
  1 sibling, 1 reply; 22+ messages in thread
From: Sasha Levin @ 2023-11-21  2:50 UTC (permalink / raw)
  To: Greg KH
  Cc: Leah Rumancik, stable, linux-xfs, amir73il, chandan.babu, fred,
	ChenXiaoSong, Guo Xuenan, Darrick J . Wong, Chandan Babu R

On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
>On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
>> From: ChenXiaoSong <chenxiaosong2@huawei.com>
>>
>> [ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
>>
>> Reproducer:
>>  1. fallocate -l 100M image
>>  2. mkfs.xfs -f image
>>  3. mount image /mnt
>>  4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
>>  5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
>>                    "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
>>     fd = open("/mnt", O_RDONLY|O_DIRECTORY);
>>     ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
>>
>> NULL pointer dereference will occur when race happens between xfs_getbmap()
>> and xfs_bmap_set_attrforkoff():
>>
>>          ioctl               |       setxattr
>>  ----------------------------|---------------------------
>>  xfs_getbmap                 |
>>    xfs_ifork_ptr             |
>>      xfs_inode_has_attr_fork |
>>        ip->i_forkoff == 0    |
>>      return NULL             |
>>    ifp == NULL               |
>>                              | xfs_bmap_set_attrforkoff
>>                              |   ip->i_forkoff > 0
>>    xfs_inode_has_attr_fork   |
>>      ip->i_forkoff > 0       |
>>    ifp == NULL               |
>>    ifp->if_format            |
>>
>> Fix this by locking i_lock before xfs_ifork_ptr().
>>
>> Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
>> Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
>> Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
>> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
>> [djwong: added fixes tag]
>> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
>> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
>> Acked-by: Chandan Babu R <chandanbabu@kernel.org>
>> ---
>>  fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
>>  1 file changed, 9 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index fd2ad6a3019c..bea6cc26abf9 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -439,29 +439,28 @@ xfs_getbmap(
>>  		whichfork = XFS_COW_FORK;
>>  	else
>>  		whichfork = XFS_DATA_FORK;
>> -	ifp = XFS_IFORK_PTR(ip, whichfork);
>>
>>  	xfs_ilock(ip, XFS_IOLOCK_SHARED);
>>  	switch (whichfork) {
>>  	case XFS_ATTR_FORK:
>> +		lock = xfs_ilock_attr_map_shared(ip);
>>  		if (!XFS_IFORK_Q(ip))
>> -			goto out_unlock_iolock;
>> +			goto out_unlock_ilock;
>>
>>  		max_len = 1LL << 32;
>> -		lock = xfs_ilock_attr_map_shared(ip);
>>  		break;
>>  	case XFS_COW_FORK:
>> +		lock = XFS_ILOCK_SHARED;
>> +		xfs_ilock(ip, lock);
>> +
>>  		/* No CoW fork? Just return */
>> -		if (!ifp)
>> -			goto out_unlock_iolock;
>> +		if (!XFS_IFORK_PTR(ip, whichfork))
>> +			goto out_unlock_ilock;
>>
>>  		if (xfs_get_cowextsz_hint(ip))
>>  			max_len = mp->m_super->s_maxbytes;
>>  		else
>>  			max_len = XFS_ISIZE(ip);
>> -
>> -		lock = XFS_ILOCK_SHARED;
>> -		xfs_ilock(ip, lock);
>>  		break;
>>  	case XFS_DATA_FORK:
>>  		if (!(iflags & BMV_IF_DELALLOC) &&
>> @@ -491,6 +490,8 @@ xfs_getbmap(
>>  		break;
>>  	}
>>
>> +	ifp = XFS_IFORK_PTR(ip, whichfork);
>> +
>>  	switch (ifp->if_format) {
>>  	case XFS_DINODE_FMT_EXTENTS:
>>  	case XFS_DINODE_FMT_BTREE:
>> --
>> 2.43.0.rc0.421.g78406f8d94-goog
>>
>
>This patch breaks the build, how was it tested?
>
>fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’:
>fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address]
>  457 |                 if (!XFS_IFORK_PTR(ip, whichfork))
>      |                     ^
>In file included from fs/xfs/xfs_bmap_util.c:16:
>fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here
>   38 |         struct xfs_ifork        i_df;           /* data fork */
>      |                                 ^~~~
>cc1: all warnings being treated as errors

That's odd. I actually ended up queueing these patches earlier, and I
don't see any such warnings.

Looking at the code, this is a bit weird too - do you see these warnings
with the current 5.15 queue?

-- 
Thanks,
Sasha

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap()
  2023-11-21  2:50     ` Sasha Levin
@ 2023-11-21  5:17       ` Greg KH
  0 siblings, 0 replies; 22+ messages in thread
From: Greg KH @ 2023-11-21  5:17 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Leah Rumancik, stable, linux-xfs, amir73il, chandan.babu, fred,
	ChenXiaoSong, Guo Xuenan, Darrick J . Wong, Chandan Babu R

On Mon, Nov 20, 2023 at 09:50:45PM -0500, Sasha Levin wrote:
> On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
> > On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
> > > From: ChenXiaoSong <chenxiaosong2@huawei.com>
> > > 
> > > [ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
> > > 
> > > Reproducer:
> > >  1. fallocate -l 100M image
> > >  2. mkfs.xfs -f image
> > >  3. mount image /mnt
> > >  4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
> > >  5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00"
> > >                    "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7";
> > >     fd = open("/mnt", O_RDONLY|O_DIRECTORY);
> > >     ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
> > > 
> > > NULL pointer dereference will occur when race happens between xfs_getbmap()
> > > and xfs_bmap_set_attrforkoff():
> > > 
> > >          ioctl               |       setxattr
> > >  ----------------------------|---------------------------
> > >  xfs_getbmap                 |
> > >    xfs_ifork_ptr             |
> > >      xfs_inode_has_attr_fork |
> > >        ip->i_forkoff == 0    |
> > >      return NULL             |
> > >    ifp == NULL               |
> > >                              | xfs_bmap_set_attrforkoff
> > >                              |   ip->i_forkoff > 0
> > >    xfs_inode_has_attr_fork   |
> > >      ip->i_forkoff > 0       |
> > >    ifp == NULL               |
> > >    ifp->if_format            |
> > > 
> > > Fix this by locking i_lock before xfs_ifork_ptr().
> > > 
> > > Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers")
> > > Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
> > > Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
> > > Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > > [djwong: added fixes tag]
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> > > Acked-by: Chandan Babu R <chandanbabu@kernel.org>
> > > ---
> > >  fs/xfs/xfs_bmap_util.c | 17 +++++++++--------
> > >  1 file changed, 9 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> > > index fd2ad6a3019c..bea6cc26abf9 100644
> > > --- a/fs/xfs/xfs_bmap_util.c
> > > +++ b/fs/xfs/xfs_bmap_util.c
> > > @@ -439,29 +439,28 @@ xfs_getbmap(
> > >  		whichfork = XFS_COW_FORK;
> > >  	else
> > >  		whichfork = XFS_DATA_FORK;
> > > -	ifp = XFS_IFORK_PTR(ip, whichfork);
> > > 
> > >  	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> > >  	switch (whichfork) {
> > >  	case XFS_ATTR_FORK:
> > > +		lock = xfs_ilock_attr_map_shared(ip);
> > >  		if (!XFS_IFORK_Q(ip))
> > > -			goto out_unlock_iolock;
> > > +			goto out_unlock_ilock;
> > > 
> > >  		max_len = 1LL << 32;
> > > -		lock = xfs_ilock_attr_map_shared(ip);
> > >  		break;
> > >  	case XFS_COW_FORK:
> > > +		lock = XFS_ILOCK_SHARED;
> > > +		xfs_ilock(ip, lock);
> > > +
> > >  		/* No CoW fork? Just return */
> > > -		if (!ifp)
> > > -			goto out_unlock_iolock;
> > > +		if (!XFS_IFORK_PTR(ip, whichfork))
> > > +			goto out_unlock_ilock;
> > > 
> > >  		if (xfs_get_cowextsz_hint(ip))
> > >  			max_len = mp->m_super->s_maxbytes;
> > >  		else
> > >  			max_len = XFS_ISIZE(ip);
> > > -
> > > -		lock = XFS_ILOCK_SHARED;
> > > -		xfs_ilock(ip, lock);
> > >  		break;
> > >  	case XFS_DATA_FORK:
> > >  		if (!(iflags & BMV_IF_DELALLOC) &&
> > > @@ -491,6 +490,8 @@ xfs_getbmap(
> > >  		break;
> > >  	}
> > > 
> > > +	ifp = XFS_IFORK_PTR(ip, whichfork);
> > > +
> > >  	switch (ifp->if_format) {
> > >  	case XFS_DINODE_FMT_EXTENTS:
> > >  	case XFS_DINODE_FMT_BTREE:
> > > --
> > > 2.43.0.rc0.421.g78406f8d94-goog
> > > 
> > 
> > This patch breaks the build, how was it tested?
> > 
> > fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’:
> > fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address]
> >  457 |                 if (!XFS_IFORK_PTR(ip, whichfork))
> >      |                     ^
> > In file included from fs/xfs/xfs_bmap_util.c:16:
> > fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here
> >   38 |         struct xfs_ifork        i_df;           /* data fork */
> >      |                                 ^~~~
> > cc1: all warnings being treated as errors
> 
> That's odd. I actually ended up queueing these patches earlier, and I
> don't see any such warnings.
> 
> Looking at the code, this is a bit weird too - do you see these warnings
> with the current 5.15 queue?

I did, that's where I saw this, so I dropped this commit from there, it
failed my builds using gcc-12.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 5.15 10/17] xfs: fix intermittent hang during quotacheck
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (7 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap() Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 11/17] xfs: add missing cmap->br_state = XFS_EXT_NORM update Leah Rumancik
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Dave Chinner, Leah Rumancik, Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit f0c2d7d2abca24d19831c99edea458704fac8087 ]

Every now and then, I see the following hang during mount time
quotacheck when running fstests.  Turning on KASAN seems to make it
happen somewhat more frequently.  I've edited the backtrace for brevity.

XFS (sdd): Quotacheck needed: Please wait.
XFS: Assertion failed: bp->b_flags & _XBF_DELWRI_Q, file: fs/xfs/xfs_buf.c, line: 2411
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1831409 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
CPU: 0 PID: 1831409 Comm: mount Tainted: G        W         5.19.0-rc6-xfsx #rc6 09911566947b9f737b036b4af85e399e4b9aef64
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
RIP: 0010:assfail+0x46/0x4a [xfs]
Code: a0 8f 41 a0 e8 45 fe ff ff 8a 1d 2c 36 10 00 80 fb 01 76 0f 0f b6 f3 48 c7 c7 c0 f0 4f a0 e8 10 f0 02 e1 80 e3 01 74 02 0f 0b <0f> 0b 5b c3 48 8d 45 10 48 89 e2 4c 89 e6 48 89 1c 24 48 89 44 24
RSP: 0018:ffffc900078c7b30 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff8880099ac000 RCX: 000000007fffffff
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa0418fa0
RBP: ffff8880197bc1c0 R08: 0000000000000000 R09: 000000000000000a
R10: 000000000000000a R11: f000000000000000 R12: ffffc900078c7d20
R13: 00000000fffffff5 R14: ffffc900078c7d20 R15: 0000000000000000
FS:  00007f0449903800(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005610ada631f0 CR3: 0000000014dd8002 CR4: 00000000001706f0
Call Trace:
 <TASK>
 xfs_buf_delwri_pushbuf+0x150/0x160 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_qm_flush_one+0xd6/0x130 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_qm_dquot_walk.isra.0+0x109/0x1e0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_qm_quotacheck+0x319/0x490 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_qm_mount_quotas+0x65/0x2c0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_mountfs+0x6b5/0xab0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 xfs_fs_fill_super+0x781/0x990 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368]
 get_tree_bdev+0x175/0x280
 vfs_get_tree+0x1a/0x80
 path_mount+0x6f5/0xaa0
 __x64_sys_mount+0x103/0x140
 do_syscall_64+0x2b/0x80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

I /think/ this can happen if xfs_qm_flush_one is racing with
xfs_qm_dquot_isolate (i.e. dquot reclaim) when the second function has
taken the dquot flush lock but xfs_qm_dqflush hasn't yet locked the
dquot buffer, let alone queued it to the delwri list.  In this case,
flush_one will fail to get the dquot flush lock, but it can lock the
incore buffer, but xfs_buf_delwri_pushbuf will then trip over this
ASSERT, which checks that the buffer isn't on a delwri list.  The hang
results because the _delwri_submit_buffers ignores non DELWRI_Q buffers,
which means that xfs_buf_iowait waits forever for an IO that has not yet
been scheduled.

AFAICT, a reasonable solution here is to detect a dquot buffer that is
not on a DELWRI list, drop it, and return -EAGAIN to try the flush
again.  It's not /that/ big of a deal if quotacheck writes the dquot
buffer repeatedly before we even set QUOTA_CHKD.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_qm.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 623244650a2f..792736e29a37 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1244,6 +1244,13 @@ xfs_qm_flush_one(
 			error = -EINVAL;
 			goto out_unlock;
 		}
+
+		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+			error = -EAGAIN;
+			xfs_buf_relse(bp);
+			goto out_unlock;
+		}
+
 		xfs_buf_unlock(bp);
 
 		xfs_buf_delwri_pushbuf(bp, buffer_list);
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 11/17] xfs: add missing cmap->br_state = XFS_EXT_NORM update
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (8 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 10/17] xfs: fix intermittent hang during quotacheck Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 12/17] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork Leah Rumancik
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Gao Xiang,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Gao Xiang <hsiangkao@linux.alibaba.com>

[ Upstream commit 1a39ae415c1be1e46f5b3f97d438c7c4adc22b63 ]

COW extents are already converted into written real extents after
xfs_reflink_convert_cow_locked(), therefore cmap->br_state should
reflect it.

Otherwise, there is another necessary unwritten convertion
triggered in xfs_dio_write_end_io() for direct I/O cases.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_reflink.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 36832e4bc803..628ce65d02bb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -425,7 +425,10 @@ xfs_reflink_allocate_cow(
 	if (!convert_now || cmap->br_state == XFS_EXT_NORM)
 		return 0;
 	trace_xfs_reflink_convert_cow(ip, cmap);
-	return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+	if (!error)
+		cmap->br_state = XFS_EXT_NORM;
+	return error;
 
 out_trans_cancel:
 	xfs_trans_cancel(tp);
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 12/17] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (9 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 11/17] xfs: add missing cmap->br_state = XFS_EXT_NORM update Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 13/17] xfs: fix inode reservation space for removing transaction Leah Rumancik
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Wengang Wang,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Chandan Babu R <chandan.babu@oracle.com>

[ Upstream commit d62113303d691bcd8d0675ae4ac63e7769afc56c ]

On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error
even though the filesystem has sufficient number of free blocks.

This occurs if the file offset range on which the write operation is being
performed has a delalloc extent in the cow fork and this delalloc extent
begins much before the Direct IO range.

In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to
allocate the blocks mapped by the delalloc extent. The extent thus allocated
may not cover the beginning of file offset range on which the Direct IO write
was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC.

The following script reliably recreates the bug described above.

  #!/usr/bin/bash

  device=/dev/loop0
  shortdev=$(basename $device)

  mntpnt=/mnt/
  file1=${mntpnt}/file1
  file2=${mntpnt}/file2
  fragmentedfile=${mntpnt}/fragmentedfile
  punchprog=/root/repos/xfstests-dev/src/punch-alternating

  errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent

  umount $device > /dev/null 2>&1

  echo "Create FS"
  mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1
  if [[ $? != 0 ]]; then
  	echo "mkfs failed."
  	exit 1
  fi

  echo "Mount FS"
  mount $device $mntpnt > /dev/null 2>&1
  if [[ $? != 0 ]]; then
  	echo "mount failed."
  	exit 1
  fi

  echo "Create source file"
  xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1

  sync

  echo "Create Reflinked file"
  xfs_io -f -c "reflink $file1" $file2 &>/dev/null

  echo "Set cowextsize"
  xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1

  echo "Fragment FS"
  xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1
  sync
  $punchprog $fragmentedfile

  echo "Allocate block sized extent from now onwards"
  echo -n 1 > $errortag

  echo "Create 16MiB delalloc extent in CoW fork"
  xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1

  sync

  echo "Direct I/O write at offset 12k"
  xfs_io -d -c "pwrite 12k 8k" $file1

This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk
blocks are allocated for atleast the starting file offset of the Direct IO
write range.

Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin")
Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: slight editing to make the locking less grody, and fix some style things]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_reflink.c | 198 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 163 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 628ce65d02bb..793bdf5ac2f7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -340,9 +340,41 @@ xfs_find_trim_cow_extent(
 	return 0;
 }
 
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	struct xfs_bmbt_irec	*cmap,
+	bool			convert_now)
+{
+	xfs_fileoff_t		offset_fsb = imap->br_startoff;
+	xfs_filblks_t		count_fsb = imap->br_blockcount;
+	int			error;
+
+	/*
+	 * cmap might larger than imap due to cowextsize hint.
+	 */
+	xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+	/*
+	 * COW fork extents are supposed to remain unwritten until we're ready
+	 * to initiate a disk write.  For direct I/O we are going to write the
+	 * data and need the conversion, but for buffered writes we're done.
+	 */
+	if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+		return 0;
+
+	trace_xfs_reflink_convert_cow(ip, cmap);
+
+	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+	if (!error)
+		cmap->br_state = XFS_EXT_NORM;
+
+	return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*imap,
 	struct xfs_bmbt_irec	*cmap,
@@ -351,25 +383,12 @@ xfs_reflink_allocate_cow(
 	bool			convert_now)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		offset_fsb = imap->br_startoff;
-	xfs_filblks_t		count_fsb = imap->br_blockcount;
 	struct xfs_trans	*tp;
-	int			nimaps, error = 0;
-	bool			found;
 	xfs_filblks_t		resaligned;
-	xfs_extlen_t		resblks = 0;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	if (!ip->i_cowfp) {
-		ASSERT(!xfs_is_reflink_inode(ip));
-		xfs_ifork_init_cow(ip);
-	}
-
-	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
-	if (error || !*shared)
-		return error;
-	if (found)
-		goto convert;
+	xfs_extlen_t		resblks;
+	int			nimaps;
+	int			error;
+	bool			found;
 
 	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
 		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -385,17 +404,17 @@ xfs_reflink_allocate_cow(
 
 	*lockmode = XFS_ILOCK_EXCL;
 
-	/*
-	 * Check for an overlapping extent again now that we dropped the ilock.
-	 */
 	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
 	if (error || !*shared)
 		goto out_trans_cancel;
+
 	if (found) {
 		xfs_trans_cancel(tp);
 		goto convert;
 	}
 
+	ASSERT(cmap->br_startoff > imap->br_startoff);
+
 	/* Allocate the entire reservation as unwritten blocks. */
 	nimaps = 1;
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -415,26 +434,135 @@ xfs_reflink_allocate_cow(
 	 */
 	if (nimaps == 0)
 		return -ENOSPC;
+
 convert:
-	xfs_trim_extent(cmap, offset_fsb, count_fsb);
-	/*
-	 * COW fork extents are supposed to remain unwritten until we're ready
-	 * to initiate a disk write.  For direct I/O we are going to write the
-	 * data and need the conversion, but for buffered writes we're done.
-	 */
-	if (!convert_now || cmap->br_state == XFS_EXT_NORM)
-		return 0;
-	trace_xfs_reflink_convert_cow(ip, cmap);
-	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
-	if (!error)
-		cmap->br_state = XFS_EXT_NORM;
+	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
 	return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	struct xfs_bmbt_irec	*cmap,
+	bool			*shared,
+	uint			*lockmode,
+	bool			convert_now)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			nimaps;
+	int			error;
+	bool			found;
+
+	do {
+		xfs_iunlock(ip, *lockmode);
+		*lockmode = 0;
+
+		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+				false, &tp);
+		if (error)
+			return error;
+
+		*lockmode = XFS_ILOCK_EXCL;
+
+		error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+				&found);
+		if (error || !*shared)
+			goto out_trans_cancel;
+
+		if (found) {
+			xfs_trans_cancel(tp);
+			break;
+		}
+
+		ASSERT(isnullstartblock(cmap->br_startblock) ||
+		       cmap->br_startblock == DELAYSTARTBLOCK);
+
+		/*
+		 * Replace delalloc reservation with an unwritten extent.
+		 */
+		nimaps = 1;
+		error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+				cmap->br_blockcount,
+				XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+				cmap, &nimaps);
+		if (error)
+			goto out_trans_cancel;
+
+		xfs_inode_set_cowblocks_tag(ip);
+		error = xfs_trans_commit(tp);
+		if (error)
+			return error;
+
+		/*
+		 * Allocation succeeded but the requested range was not even
+		 * partially satisfied?  Bail out!
+		 */
+		if (nimaps == 0)
+			return -ENOSPC;
+	} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
 
 out_trans_cancel:
 	xfs_trans_cancel(tp);
 	return error;
 }
 
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap,
+	struct xfs_bmbt_irec	*cmap,
+	bool			*shared,
+	uint			*lockmode,
+	bool			convert_now)
+{
+	int			error;
+	bool			found;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	if (!ip->i_cowfp) {
+		ASSERT(!xfs_is_reflink_inode(ip));
+		xfs_ifork_init_cow(ip);
+	}
+
+	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+	if (error || !*shared)
+		return error;
+
+	/* CoW fork has a real extent */
+	if (found)
+		return xfs_reflink_convert_unwritten(ip, imap, cmap,
+				convert_now);
+
+	/*
+	 * CoW fork does not have an extent and data extent is shared.
+	 * Allocate a real extent in the CoW fork.
+	 */
+	if (cmap->br_startoff > imap->br_startoff)
+		return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+				lockmode, convert_now);
+
+	/*
+	 * CoW fork has a delalloc reservation. Replace it with a real extent.
+	 * There may or may not be a data fork mapping.
+	 */
+	if (isnullstartblock(cmap->br_startblock) ||
+	    cmap->br_startblock == DELAYSTARTBLOCK)
+		return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+				lockmode, convert_now);
+
+	/* Shouldn't get here. */
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
 /*
  * Cancel CoW reservations for some block range of an inode.
  *
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 13/17] xfs: fix inode reservation space for removing transaction
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (10 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 12/17] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 14/17] xfs: avoid a UAF when log intent item recovery fails Leah Rumancik
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, hexiaole,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: hexiaole <hexiaole@kylinos.cn>

[ Upstream commit 031d166f968efba6e4f091ff75d0bb5206bb3918 ]

In 'fs/xfs/libxfs/xfs_trans_resv.c', the comment for transaction of removing a
directory entry writes:

/* fs/xfs/libxfs/xfs_trans_resv.c begin */
/*
 * For removing a directory entry we can modify:
 *    the parent directory inode: inode size
 *    the removed inode: inode size
...
xfs_calc_remove_reservation(
        struct xfs_mount        *mp)
{
        return XFS_DQUOT_LOGRES(mp) +
                xfs_calc_iunlink_add_reservation(mp) +
                max((xfs_calc_inode_res(mp, 1) +
...
/* fs/xfs/libxfs/xfs_trans_resv.c end */

There has 2 inode size of space to be reserverd, but the actual code
for inode reservation space writes.

There only count for 1 inode size to be reserved in
'xfs_calc_inode_res(mp, 1)', rather than 2.

Signed-off-by: hexiaole <hexiaole@kylinos.cn>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: remove redundant code citations]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/libxfs/xfs_trans_resv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5e300daa2559..2db9d9d12344 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -423,7 +423,7 @@ xfs_calc_remove_reservation(
 {
 	return XFS_DQUOT_LOGRES(mp) +
 		xfs_calc_iunlink_add_reservation(mp) +
-		max((xfs_calc_inode_res(mp, 1) +
+		max((xfs_calc_inode_res(mp, 2) +
 		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
 				      XFS_FSB_TO_B(mp, 1))),
 		    (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 14/17] xfs: avoid a UAF when log intent item recovery fails
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (11 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 13/17] xfs: fix inode reservation space for removing transaction Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 15/17] xfs: fix exception caused by unexpected illegal bestcount in leaf dir Leah Rumancik
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Darrick J. Wong,
	Christoph Hellwig, Leah Rumancik, Chandan Babu R

From: "Darrick J. Wong" <djwong@kernel.org>

[ Upstream commit 97cf79677ecb50a38517253ae2fd705849a7e51a ]

KASAN reported a UAF bug when I was running xfs/235:

 BUG: KASAN: use-after-free in xlog_recover_process_intents+0xa77/0xae0 [xfs]
 Read of size 8 at addr ffff88804391b360 by task mount/5680

 CPU: 2 PID: 5680 Comm: mount Not tainted 6.0.0-xfsx #6.0.0 77e7b52a4943a975441e5ac90a5ad7748b7867f6
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x34/0x44
  print_report.cold+0x2cc/0x682
  kasan_report+0xa3/0x120
  xlog_recover_process_intents+0xa77/0xae0 [xfs fb841c7180aad3f8359438576e27867f5795667e]
  xlog_recover_finish+0x7d/0x970 [xfs fb841c7180aad3f8359438576e27867f5795667e]
  xfs_log_mount_finish+0x2d7/0x5d0 [xfs fb841c7180aad3f8359438576e27867f5795667e]
  xfs_mountfs+0x11d4/0x1d10 [xfs fb841c7180aad3f8359438576e27867f5795667e]
  xfs_fs_fill_super+0x13d5/0x1a80 [xfs fb841c7180aad3f8359438576e27867f5795667e]
  get_tree_bdev+0x3da/0x6e0
  vfs_get_tree+0x7d/0x240
  path_mount+0xdd3/0x17d0
  __x64_sys_mount+0x1fa/0x270
  do_syscall_64+0x2b/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0
 RIP: 0033:0x7ff5bc069eae
 Code: 48 8b 0d 85 1f 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 52 1f 0f 00 f7 d8 64 89 01 48
 RSP: 002b:00007ffe433fd448 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ff5bc069eae
 RDX: 00005575d7213290 RSI: 00005575d72132d0 RDI: 00005575d72132b0
 RBP: 00005575d7212fd0 R08: 00005575d7213230 R09: 00005575d7213fe0
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
 R13: 00005575d7213290 R14: 00005575d72132b0 R15: 00005575d7212fd0
  </TASK>

 Allocated by task 5680:
  kasan_save_stack+0x1e/0x40
  __kasan_slab_alloc+0x66/0x80
  kmem_cache_alloc+0x152/0x320
  xfs_rui_init+0x17a/0x1b0 [xfs]
  xlog_recover_rui_commit_pass2+0xb9/0x2e0 [xfs]
  xlog_recover_items_pass2+0xe9/0x220 [xfs]
  xlog_recover_commit_trans+0x673/0x900 [xfs]
  xlog_recovery_process_trans+0xbe/0x130 [xfs]
  xlog_recover_process_data+0x103/0x2a0 [xfs]
  xlog_do_recovery_pass+0x548/0xc60 [xfs]
  xlog_do_log_recovery+0x62/0xc0 [xfs]
  xlog_do_recover+0x73/0x480 [xfs]
  xlog_recover+0x229/0x460 [xfs]
  xfs_log_mount+0x284/0x640 [xfs]
  xfs_mountfs+0xf8b/0x1d10 [xfs]
  xfs_fs_fill_super+0x13d5/0x1a80 [xfs]
  get_tree_bdev+0x3da/0x6e0
  vfs_get_tree+0x7d/0x240
  path_mount+0xdd3/0x17d0
  __x64_sys_mount+0x1fa/0x270
  do_syscall_64+0x2b/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

 Freed by task 5680:
  kasan_save_stack+0x1e/0x40
  kasan_set_track+0x21/0x30
  kasan_set_free_info+0x20/0x30
  ____kasan_slab_free+0x144/0x1b0
  slab_free_freelist_hook+0xab/0x180
  kmem_cache_free+0x1f1/0x410
  xfs_rud_item_release+0x33/0x80 [xfs]
  xfs_trans_free_items+0xc3/0x220 [xfs]
  xfs_trans_cancel+0x1fa/0x590 [xfs]
  xfs_rui_item_recover+0x913/0xd60 [xfs]
  xlog_recover_process_intents+0x24e/0xae0 [xfs]
  xlog_recover_finish+0x7d/0x970 [xfs]
  xfs_log_mount_finish+0x2d7/0x5d0 [xfs]
  xfs_mountfs+0x11d4/0x1d10 [xfs]
  xfs_fs_fill_super+0x13d5/0x1a80 [xfs]
  get_tree_bdev+0x3da/0x6e0
  vfs_get_tree+0x7d/0x240
  path_mount+0xdd3/0x17d0
  __x64_sys_mount+0x1fa/0x270
  do_syscall_64+0x2b/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

 The buggy address belongs to the object at ffff88804391b300
  which belongs to the cache xfs_rui_item of size 688
 The buggy address is located 96 bytes inside of
  688-byte region [ffff88804391b300, ffff88804391b5b0)

 The buggy address belongs to the physical page:
 page:ffffea00010e4600 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888043919320 pfn:0x43918
 head:ffffea00010e4600 order:2 compound_mapcount:0 compound_pincount:0
 flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff)
 raw: 04fff80000010200 0000000000000000 dead000000000122 ffff88807f0eadc0
 raw: ffff888043919320 0000000080140010 00000001ffffffff 0000000000000000
 page dumped because: kasan: bad access detected

 Memory state around the buggy address:
  ffff88804391b200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff88804391b280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 >ffff88804391b300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                                        ^
  ffff88804391b380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff88804391b400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ==================================================================

The test fuzzes an rmap btree block and starts writer threads to induce
a filesystem shutdown on the corrupt block.  When the filesystem is
remounted, recovery will try to replay the committed rmap intent item,
but the corruption problem causes the recovery transaction to fail.
Cancelling the transaction frees the RUD, which frees the RUI that we
recovered.

When we return to xlog_recover_process_intents, @lip is now a dangling
pointer, and we cannot use it to find the iop_recover method for the
tracepoint.  Hence we must store the item ops before calling
->iop_recover if we want to give it to the tracepoint so that the trace
data will tell us exactly which intent item failed.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_log_recover.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 04961ebf16ea..3d844a250b71 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2560,6 +2560,7 @@ xlog_recover_process_intents(
 	for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
 	     lip != NULL;
 	     lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+		const struct xfs_item_ops	*ops;
 		/*
 		 * We're done when we see something other than an intent.
 		 * There should be no intents left in the AIL now.
@@ -2584,13 +2585,17 @@ xlog_recover_process_intents(
 		 * deferred ops, you /must/ attach them to the capture list in
 		 * the recover routine or else those subsequent intents will be
 		 * replayed in the wrong order!
+		 *
+		 * The recovery function can free the log item, so we must not
+		 * access lip after it returns.
 		 */
 		spin_unlock(&ailp->ail_lock);
-		error = lip->li_ops->iop_recover(lip, &capture_list);
+		ops = lip->li_ops;
+		error = ops->iop_recover(lip, &capture_list);
 		spin_lock(&ailp->ail_lock);
 		if (error) {
 			trace_xlog_intent_recovery_failed(log->l_mp, error,
-					lip->li_ops->iop_recover);
+					ops->iop_recover);
 			break;
 		}
 	}
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 15/17] xfs: fix exception caused by unexpected illegal bestcount in leaf dir
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (12 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 14/17] xfs: avoid a UAF when log intent item recovery fails Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 16/17] xfs: fix memory leak in xfs_errortag_init Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 17/17] xfs: Fix unreferenced object reported by kmemleak in xfs_sysfs_init() Leah Rumancik
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Guo Xuenan, Hou Tao,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Guo Xuenan <guoxuenan@huawei.com>

[ Upstream commit 13cf24e00665c9751951a422756d975812b71173 ]

For leaf dir, In most cases, there should be as many bestfree slots
as the dir data blocks that can fit under i_size (except for [1]).

Root cause is we don't examin the number bestfree slots, when the slots
number less than dir data blocks, if we need to allocate new dir data
block and update the bestfree array, we will use the dir block number as
index to assign bestfree array, while we did not check the leaf buf
boundary which may cause UAF or other memory access problem. This issue
can also triggered with test cases xfs/473 from fstests.

According to Dave Chinner & Darrick's suggestion, adding buffer verifier
to detect this abnormal situation in time.
Simplify the testcase for fstest xfs/554 [1]

The error log is shown as follows:
==================================================================
BUG: KASAN: use-after-free in xfs_dir2_leaf_addname+0x1995/0x1ac0
Write of size 2 at addr ffff88810168b000 by task touch/1552
CPU: 5 PID: 1552 Comm: touch Not tainted 6.0.0-rc3+ #101
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x4d/0x66
 print_report.cold+0xf6/0x691
 kasan_report+0xa8/0x120
 xfs_dir2_leaf_addname+0x1995/0x1ac0
 xfs_dir_createname+0x58c/0x7f0
 xfs_create+0x7af/0x1010
 xfs_generic_create+0x270/0x5e0
 path_openat+0x270b/0x3450
 do_filp_open+0x1cf/0x2b0
 do_sys_openat2+0x46b/0x7a0
 do_sys_open+0xb7/0x130
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fe4d9e9312b
Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0
75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00
f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25
RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b
RDX: 0000000000000941 RSI: 00007ffda4c17f33 RDI: 00000000ffffff9c
RBP: 00007ffda4c17f33 R08: 0000000000000000 R09: 0000000000000000
R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941
R13: 00007fe4d9f631a4 R14: 00007ffda4c17f33 R15: 0000000000000000
 </TASK>

The buggy address belongs to the physical page:
page:ffffea000405a2c0 refcount:0 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x10168b
flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff)
raw: 002fffff80000000 ffffea0004057788 ffffea000402dbc8 0000000000000000
raw: 0000000000000000 0000000000170000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff88810168af00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff88810168af80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff88810168b000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                   ^
 ffff88810168b080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff88810168b100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================
Disabling lock debugging due to kernel taint
00000000: 58 44 44 33 5b 53 35 c2 00 00 00 00 00 00 00 78
XDD3[S5........x
XFS (sdb): Internal error xfs_dir2_data_use_free at line 1200 of file
fs/xfs/libxfs/xfs_dir2_data.c.  Caller
xfs_dir2_data_use_free+0x28a/0xeb0
CPU: 5 PID: 1552 Comm: touch Tainted: G    B              6.0.0-rc3+
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.13.0-1ubuntu1.1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x4d/0x66
 xfs_corruption_error+0x132/0x150
 xfs_dir2_data_use_free+0x198/0xeb0
 xfs_dir2_leaf_addname+0xa59/0x1ac0
 xfs_dir_createname+0x58c/0x7f0
 xfs_create+0x7af/0x1010
 xfs_generic_create+0x270/0x5e0
 path_openat+0x270b/0x3450
 do_filp_open+0x1cf/0x2b0
 do_sys_openat2+0x46b/0x7a0
 do_sys_open+0xb7/0x130
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fe4d9e9312b
Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0
75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00
f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25
RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b
RDX: 0000000000000941 RSI: 00007ffda4c17f46 RDI: 00000000ffffff9c
RBP: 00007ffda4c17f46 R08: 0000000000000000 R09: 0000000000000001
R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941
R13: 00007fe4d9f631a4 R14: 00007ffda4c17f46 R15: 0000000000000000
 </TASK>
XFS (sdb): Corruption detected. Unmount and run xfs_repair

[1] https://lore.kernel.org/all/20220928095355.2074025-1-guoxuenan@huawei.com/
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/libxfs/xfs_dir2_leaf.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index d9b66306a9a7..cb9e950a911d 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int(
 	xfs_dir2_leaf_tail_t		*ltp;
 	int				stale;
 	int				i;
+	bool				isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+						   hdr->magic == XFS_DIR3_LEAF1_MAGIC);
 
 	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
 
@@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int(
 		return __this_address;
 
 	/* Leaves and bests don't overlap in leaf format. */
-	if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
-	     hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+	if (isleaf1 &&
 	    (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
 		return __this_address;
 
@@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int(
 		}
 		if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			stale++;
+		if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+				be32_to_cpu(hdr->ents[i].address)) >=
+				be32_to_cpu(ltp->bestcount))
+			return __this_address;
 	}
 	if (hdr->stale != stale)
 		return __this_address;
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 16/17] xfs: fix memory leak in xfs_errortag_init
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (13 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 15/17] xfs: fix exception caused by unexpected illegal bestcount in leaf dir Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  2023-11-16  2:28 ` [PATCH 5.15 17/17] xfs: Fix unreferenced object reported by kmemleak in xfs_sysfs_init() Leah Rumancik
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Zeng Heng,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Zeng Heng <zengheng4@huawei.com>

[ Upstream commit cf4f4c12dea7a977a143c8fe5af1740b7f9876f8 ]

When `xfs_sysfs_init` returns failed, `mp->m_errortag` needs to free.
Otherwise kmemleak would report memory leak after mounting xfs image:

unreferenced object 0xffff888101364900 (size 192):
  comm "mount", pid 13099, jiffies 4294915218 (age 335.207s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000f08ad25c>] __kmalloc+0x41/0x1b0
    [<00000000dca9aeb6>] kmem_alloc+0xfd/0x430
    [<0000000040361882>] xfs_errortag_init+0x20/0x110
    [<00000000b384a0f6>] xfs_mountfs+0x6ea/0x1a30
    [<000000003774395d>] xfs_fs_fill_super+0xe10/0x1a80
    [<000000009cf07b6c>] get_tree_bdev+0x3e7/0x700
    [<00000000046b5426>] vfs_get_tree+0x8e/0x2e0
    [<00000000952ec082>] path_mount+0xf8c/0x1990
    [<00000000beb1f838>] do_mount+0xee/0x110
    [<000000000e9c41bb>] __x64_sys_mount+0x14b/0x1f0
    [<00000000f7bb938e>] do_syscall_64+0x3b/0x90
    [<000000003fcd67a9>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: c68401011522 ("xfs: expose errortag knobs via sysfs")
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_error.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 81c445e9489b..b0ccec92e015 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -224,13 +224,18 @@ int
 xfs_errortag_init(
 	struct xfs_mount	*mp)
 {
+	int ret;
+
 	mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
 			KM_MAYFAIL);
 	if (!mp->m_errortag)
 		return -ENOMEM;
 
-	return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
-			       &mp->m_kobj, "errortag");
+	ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+				&mp->m_kobj, "errortag");
+	if (ret)
+		kmem_free(mp->m_errortag);
+	return ret;
 }
 
 void
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5.15 17/17] xfs: Fix unreferenced object reported by kmemleak in xfs_sysfs_init()
  2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
                   ` (14 preceding siblings ...)
  2023-11-16  2:28 ` [PATCH 5.15 16/17] xfs: fix memory leak in xfs_errortag_init Leah Rumancik
@ 2023-11-16  2:28 ` Leah Rumancik
  15 siblings, 0 replies; 22+ messages in thread
From: Leah Rumancik @ 2023-11-16  2:28 UTC (permalink / raw)
  To: stable
  Cc: linux-xfs, amir73il, chandan.babu, fred, Li Zetao,
	Darrick J . Wong, Leah Rumancik, Chandan Babu R

From: Li Zetao <lizetao1@huawei.com>

[ Upstream commit d08af40340cad0e025d643c3982781a8f99d5032 ]

kmemleak reported a sequence of memory leaks, and one of them indicated we
failed to free a pointer:
  comm "mount", pid 19610, jiffies 4297086464 (age 60.635s)
    hex dump (first 8 bytes):
      73 64 61 00 81 88 ff ff                          sda.....
    backtrace:
      [<00000000d77f3e04>] kstrdup_const+0x46/0x70
      [<00000000e51fa804>] kobject_set_name_vargs+0x2f/0xb0
      [<00000000247cd595>] kobject_init_and_add+0xb0/0x120
      [<00000000f9139aaf>] xfs_mountfs+0x367/0xfc0
      [<00000000250d3caf>] xfs_fs_fill_super+0xa16/0xdc0
      [<000000008d873d38>] get_tree_bdev+0x256/0x390
      [<000000004881f3fa>] vfs_get_tree+0x41/0xf0
      [<000000008291ab52>] path_mount+0x9b3/0xdd0
      [<0000000022ba8f2d>] __x64_sys_mount+0x190/0x1d0

As mentioned in kobject_init_and_add() comment, if this function
returns an error, kobject_put() must be called to properly clean up
the memory associated with the object. Apparently, xfs_sysfs_init()
does not follow such a requirement. When kobject_init_and_add()
returns an error, the space of kobj->kobject.name alloced by
kstrdup_const() is unfree, which will cause the above stack.

Fix it by adding kobject_put() when kobject_init_and_add returns an
error.

Fixes: a31b1d3d89e4 ("xfs: add xfs_mount sysfs kobject")
Signed-off-by: Li Zetao <lizetao1@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: Chandan Babu R <chandanbabu@kernel.org>
---
 fs/xfs/xfs_sysfs.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 43585850f154..513095e353a5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -33,10 +33,15 @@ xfs_sysfs_init(
 	const char		*name)
 {
 	struct kobject		*parent;
+	int err;
 
 	parent = parent_kobj ? &parent_kobj->kobject : NULL;
 	init_completion(&kobj->complete);
-	return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+	err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+	if (err)
+		kobject_put(&kobj->kobject);
+
+	return err;
 }
 
 static inline void
-- 
2.43.0.rc0.421.g78406f8d94-goog


^ permalink raw reply related	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2023-11-21  5:18 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-16  2:28 [PATCH 5.15 01/17] xfs: refactor buffer cancellation table allocation Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 02/17] xfs: don't leak xfs_buf_cancel structures when recovery fails Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 03/17] xfs: convert buf_cancel_table allocation to kmalloc_array Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 04/17] xfs: use invalidate_lock to check the state of mmap_lock Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 05/17] xfs: prevent a UAF when log IO errors race with unmount Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 06/17] xfs: flush inode gc workqueue before clearing agi bucket Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 07/17] xfs: fix use-after-free in xattr node block inactivation Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 08/17] xfs: don't leak memory when attr fork loading fails Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 09/17] xfs: fix NULL pointer dereference in xfs_getbmap() Leah Rumancik
2023-11-20 15:38   ` Greg KH
2023-11-20 19:11     ` Darrick J. Wong
2023-11-21  5:18       ` Greg KH
2023-11-21  2:50     ` Sasha Levin
2023-11-21  5:17       ` Greg KH
2023-11-16  2:28 ` [PATCH 5.15 10/17] xfs: fix intermittent hang during quotacheck Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 11/17] xfs: add missing cmap->br_state = XFS_EXT_NORM update Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 12/17] xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 13/17] xfs: fix inode reservation space for removing transaction Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 14/17] xfs: avoid a UAF when log intent item recovery fails Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 15/17] xfs: fix exception caused by unexpected illegal bestcount in leaf dir Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 16/17] xfs: fix memory leak in xfs_errortag_init Leah Rumancik
2023-11-16  2:28 ` [PATCH 5.15 17/17] xfs: Fix unreferenced object reported by kmemleak in xfs_sysfs_init() Leah Rumancik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox