[PATCH RFCRAP 0/5] xfs: atomic file metadata repairs

linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs
@ 2020-04-29  2:46 Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 1/5] xfs: parent repair should try the dcache first Darrick J. Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

Hi all,

This series connects the atomic extent swap mechanism in the previous
series to the online filesystem repair code.  This enables repair
functions to construct a clean copy of a directory, xattr information,
realtime bitmaps, and realtime summary information in a temporary inode.
If this completes successfully, the new contents can be swapped
atomically into the inode being repaired.  This is essential to avoid
making corruption problems worse if the system goes down in the middle
of running repair.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-metadata-atomically

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/5] xfs: parent repair should try the dcache first
  2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
@ 2020-04-29  2:46 ` Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 2/5] xfs: create temporary files and directories for online repair Darrick J. Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

If we need to find a directory's parent, try the dcache first.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/dir_repair.c    |    7 +++++-
 fs/xfs/scrub/parent.h        |    1 +
 fs/xfs/scrub/parent_repair.c |   47 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 49 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 33e98e4172db..b299f8b35ce4 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -728,11 +728,16 @@ xrep_dir_validate_parent(
 
 	/*
 	 * If the directory salvage scan found no parent or found an obviously
-	 * incorrect parent, jump to the filesystem scan.
+	 * incorrect parent, try asking the dcache for the parent.
+	 *
+	 * If the dcache doesn't know about a parent or the parent seems
+	 * obviously incorrect, jump to the filesystem scan.
 	 *
 	 * Otherwise, if the alleged parent seems plausible, scan the directory
 	 * to make sure it really points to us.
 	 */
+	if (!xrep_parent_acceptable(sc, rd->parent_ino))
+		rd->parent_ino = xrep_parent_check_dcache(sc->ip);
 	if (!xrep_parent_acceptable(sc, rd->parent_ino))
 		goto scan;
 
diff --git a/fs/xfs/scrub/parent.h b/fs/xfs/scrub/parent.h
index 6c79f7f99e9e..62db392b19a5 100644
--- a/fs/xfs/scrub/parent.h
+++ b/fs/xfs/scrub/parent.h
@@ -14,5 +14,6 @@ typedef int (*xrep_parents_iter_fn)(struct xfs_inode *dp, struct xfs_name *name,
 int xrep_scan_for_parents(struct xfs_scrub *sc, xfs_ino_t target_ino,
 		xrep_parents_iter_fn fn, void *data);
 bool xrep_parent_acceptable(struct xfs_scrub *sc, xfs_ino_t ino);
+xfs_ino_t xrep_parent_check_dcache(struct xfs_inode *dp);
 
 #endif /* __XFS_SCRUB_PARENT_H__ */
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 3d3993ba920d..44cd7da405e5 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -174,6 +174,37 @@ xrep_parents_scan_inode(
 	return error;
 }
 
+/*
+ * Ask the dcache for this directory's parent.  Returns the parent's inode
+ * number, or NULLFSINO if there's no dentry or the parent can't be grabbed.
+ */
+xfs_ino_t
+xrep_parent_check_dcache(
+	struct xfs_inode	*dp)
+{
+	struct inode		*pip;
+	struct dentry		*dentry, *parent;
+	xfs_ino_t		ret = NULLFSINO;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+	dentry = d_find_alias(VFS_I(dp));
+	if (!dentry)
+		return ret;
+
+	/* dget_parent always hands back a referenced dentry, never NULL. */
+	parent = dget_parent(dentry);
+	pip = igrab(d_inode(parent));
+	dput(parent);
+	dput(dentry);
+	/* igrab returns NULL if the inode is being freed; don't deref it. */
+	if (pip) {
+		ret = pip->i_ino;
+		xfs_irele(XFS_I(pip));
+	}
+	return ret;
+}
+
 /* Is this an acceptable parent for the inode we're scrubbing? */
 bool
 xrep_parent_acceptable(
@@ -271,11 +302,17 @@ xrep_parent(
 	if (sick & XFS_SICK_INO_DIR)
 		return -EFSCORRUPTED;
 
-	/* Scan the entire directory tree for the directory's parent. */
-	error = xrep_scan_for_parents(sc, sc->ip->i_ino, xrep_parent_absorb,
-			&rp);
-	if (error)
-		return error;
+	/*
+	 * Ask the dcache who it thinks the parent might be.  If that doesn't
+	 * pass muster, scan the entire filesystem for the directory's parent.
+	 */
+	rp.parent_ino = xrep_parent_check_dcache(sc->ip);
+	if (!xrep_parent_acceptable(sc, rp.parent_ino)) {
+		error = xrep_scan_for_parents(sc, sc->ip->i_ino,
+				xrep_parent_absorb, &rp);
+		if (error)
+			return error;
+	}
 
 	/* If we still don't have a parent, bail out. */
 	if (!xrep_parent_acceptable(sc, rp.parent_ino))


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] xfs: create temporary files and directories for online repair
  2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 1/5] xfs: parent repair should try the dcache first Darrick J. Wong
@ 2020-04-29  2:46 ` Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 3/5] xfs: use atomic extent swapping to repair rt metadata Darrick J. Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

Teach the online repair code how to create temporary files or
directories.  These temporary files can be used to stage reconstructed
information until we're ready to perform an atomic extent swap to commit
the new metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c |  122 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |    1 
 fs/xfs/scrub/scrub.c  |    6 ++
 fs/xfs/scrub/scrub.h  |    3 +
 4 files changed, 132 insertions(+)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c134804bc5a1..0ec483d511cd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -31,6 +31,9 @@
 #include "xfs_attr.h"
 #include "xfs_reflink.h"
 #include "xfs_health.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_dir2.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -1502,6 +1505,125 @@ xrep_metadata_inode_forks(
 	return error;
 }
 
+/*
+ * Create an unlinked temporary file or directory (per @mode) and stash it in
+ * sc->tempip.  Returns 0 or a negative errno; on error sc->tempip is NULL.
+ */
+int
+xrep_create_tempfile(
+	struct xfs_scrub	*sc,
+	uint16_t		mode)
+{
+	struct xfs_ialloc_args	args = {
+		.pip		= sc->mp->m_rootip,
+		.nlink		= 0,
+		.mode		= mode,
+	};
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_trans	*tp = NULL;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	struct xfs_trans_res	*tres;
+	unsigned int		resblks;
+	bool			is_dir = S_ISDIR(mode);
+	int			error;
+
+	ASSERT(sc->tp == NULL);
+	ASSERT(sc->tempip == NULL);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.  The temp inode
+	 * is root owned, but we still want accurate quota accounting.
+	 */
+	error = xfs_qm_vop_dqalloc(sc->mp->m_rootip, args.uid, args.gid,
+			args.prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	if (is_dir) {
+		resblks = XFS_MKDIR_SPACE_RES(mp, 0);
+		tres = &M_RES(mp)->tr_mkdir;
+	} else {
+		resblks = XFS_IALLOC_SPACE_RES(mp);
+		tres = &M_RES(mp)->tr_create_tmpfile;
+	}
+
+	error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+	if (error)
+		goto out_release_inode;
+
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks,
+			1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	/* Allocate inode, set up directory. */
+	error = xfs_dir_ialloc(&tp, &args, &sc->tempip);
+	if (error)
+		goto out_trans_cancel;
+
+	if (is_dir) {
+		error = xfs_dir_init(tp, sc->tempip, sc->mp->m_rootip);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	/*
+	 * Attach the dquot(s) to the new inode and modify them incore.  The
+	 * inode's ids cannot have changed because it was created locked.
+	 */
+	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);
+
+	/*
+	 * Put our temp file on the unlinked list so it's purged automatically.
+	 * Anything rebuilt in this file must be atomically swapped with the
+	 * original file because the contents are purged when the inode is
+	 * dropped or log recovery cleans out the unlinked list.
+	 */
+	error = xfs_iunlink(tp, sc->tempip);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_release_inode;
+
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	/* Finish setting up the incore / vfs context. */
+	xfs_setup_iops(sc->tempip);
+	xfs_finish_inode_setup(sc->tempip);
+
+	sc->temp_ilock_flags = 0;
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+out_release_inode:
+	/*
+	 * Finish setup and release the inode only after the transaction is
+	 * cancelled to prevent recursive transactions and inactivation
+	 * deadlocks.  Clear sc->tempip so teardown can't release it again.
+	 */
+	if (sc->tempip) {
+		xfs_finish_inode_setup(sc->tempip);
+		xfs_irele(sc->tempip);
+		sc->tempip = NULL;
+	}
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
+
+	return error;
+}
+
 /*
  * Make sure that the given range of the data fork of the metadata file being
  * checked is mapped to written blocks.  The caller must ensure that the inode
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9388b3ce1cb8..299d39360c11 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -32,6 +32,7 @@ int xrep_alloc_ag_block(struct xfs_scrub *sc,
 int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
 		struct xfs_buf **bpp, xfs_btnum_t btnum,
 		const struct xfs_buf_ops *ops);
+int xrep_create_tempfile(struct xfs_scrub *sc, uint16_t mode);
 int xrep_fallocate(struct xfs_scrub *sc, xfs_fileoff_t off, xfs_filblks_t len);
 
 typedef int (*xrep_setfile_getbuf_fn)(struct xfs_scrub *sc,
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index a2911a01cf68..a6f5b5c21f3f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -194,6 +194,12 @@ xchk_teardown(
 		kmem_free(sc->buf);
 		sc->buf = NULL;
 	}
+	if (sc->tempip) {
+		if (sc->temp_ilock_flags)
+			xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
+		xfs_irele(sc->tempip);
+		sc->tempip = NULL;
+	}
 	return error;
 }
 
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index b8d582808cd3..798942bd7eaf 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -72,6 +72,9 @@ struct xfs_scrub {
 	struct file			*xfile;
 	uint				ilock_flags;
 
+	struct xfs_inode		*tempip;
+	uint				temp_ilock_flags;
+
 	/* See the XCHK/XREP state flags below. */
 	unsigned int			flags;
 


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] xfs: use atomic extent swapping to repair rt metadata
  2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 1/5] xfs: parent repair should try the dcache first Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 2/5] xfs: create temporary files and directories for online repair Darrick J. Wong
@ 2020-04-29  2:46 ` Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 4/5] xfs: use atomic extent swapping to repair extended attributes Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 5/5] xfs: use atomic extent swapping to repair directories Darrick J. Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

When repairing realtime volume metadata online, stage the new bitmap and
summary contents in a temporary file and use the atomic extent swapping
mechanism to commit the results in bulk.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/repair.c           |   34 ++++++++++++++++++++--------
 fs/xfs/scrub/rtbitmap.c         |   12 ++++++++++
 fs/xfs/scrub/rtbitmap_repair.c  |   48 +++++++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/rtsummary.c        |   12 ++++++++++
 fs/xfs/scrub/rtsummary_repair.c |   46 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 137 insertions(+), 15 deletions(-)


diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 0ec483d511cd..5b876b02b9f4 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1636,13 +1636,13 @@ xrep_fallocate(
 	xfs_filblks_t		len)
 {
 	struct xfs_bmbt_irec	map;
+	struct xfs_inode	*ip = sc->tempip;
 	xfs_fileoff_t		end = off + len;
 	int			nmaps;
 	int			error = 0;
 
-	error = xrep_ino_dqattach(sc);
-	if (error)
-		return error;
+	ASSERT(sc->tempip != NULL);
+	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, ip));
 
 	while (off < len) {
 		/*
@@ -1650,7 +1650,7 @@ xrep_fallocate(
 		 * in ok shape.
 		 */
 		nmaps = 1;
-		error = xfs_bmapi_read(sc->ip, off, end - off, &map, &nmaps,
+		error = xfs_bmapi_read(ip, off, end - off, &map, &nmaps,
 				XFS_DATA_FORK);
 		if (error)
 			break;
@@ -1672,15 +1672,21 @@ xrep_fallocate(
 		 * allocated to it.
 		 */
 		nmaps = 1;
-		error = xfs_bmapi_write(sc->tp, sc->ip, off, end - off,
+		error = xfs_bmapi_write(sc->tp, ip, off, end - off,
 				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
 				&nmaps);
 		if (error)
 			break;
 
-		error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+		/*
+		 * Roll the transaction with the inode we're fixing and the
+		 * temp inode, so that neither can pin the log.
+		 */
+		xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+		error = xfs_trans_roll_inode(&sc->tp, ip);
 		if (error)
 			break;
+		xfs_trans_ijoin(sc->tp, sc->ip, 0);
 		off += map.br_startblock;
 	}
 
@@ -1701,6 +1707,7 @@ xrep_set_file_contents(
 {
 	struct list_head	buffers_list;
 	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip = sc->tempip;
 	struct xfs_buf		*bp;
 	xfs_rtblock_t		off = 0;
 	loff_t			pos = 0;
@@ -1744,12 +1751,19 @@ xrep_set_file_contents(
 	}
 
 	/* Set the new inode size, if needed. */
-	if (sc->ip->i_d.di_size != isize) {
-		sc->ip->i_d.di_size = isize;
-		xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	if (ip->i_d.di_size != isize) {
+		ip->i_d.di_size = isize;
+		xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
 	}
 
-	return xfs_trans_roll_inode(&sc->tp, sc->ip);
+	/*
+	 * Roll transaction, being careful to keep the tempfile and the
+	 * metadata inode joined.
+	 */
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	error = xfs_trans_roll_inode(&sc->tp, ip);
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	return error;
 out:
 	xfs_buf_delwri_cancel(&buffers_list);
 	return error;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8488d137bf92..c3396d9ead49 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -20,6 +20,7 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
+#include "scrub/repair.h"
 
 /* Set us up with the realtime metadata locked. */
 int
@@ -29,6 +30,17 @@ xchk_setup_rt(
 {
 	int			error;
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+		if (!xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
+			return -EOPNOTSUPP;
+
+		error = xrep_create_tempfile(sc, S_IFREG);
+		if (error)
+			return error;
+	}
+#endif
+
 	error = xchk_setup_fs(sc, ip);
 	if (error)
 		return error;
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
index 229dd23d9d3e..d812efe8dd2a 100644
--- a/fs/xfs/scrub/rtbitmap_repair.c
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -18,6 +18,7 @@
 #include "xfs_bmap.h"
 #include "xfs_rmap.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_swapext.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -207,7 +208,29 @@ xrep_rtbitmap_get_buf(
 	xfs_fileoff_t		off,
 	struct xfs_buf		**bpp)
 {
-	return xfs_rtbuf_get(sc->mp, sc->tp, off, 0, bpp);
+	struct xfs_bmbt_irec	map;
+	struct xfs_buf		*bp;
+	struct xfs_mount	*mp = sc->mp;
+	int			nmap = 1;
+	int			error;
+
+	error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmap,
+			XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+		return -EFSCORRUPTED;
+
+	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, map.br_startblock),
+			mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF);
+	*bpp = bp;
+	return 0;
 }
 
 /* Repair the realtime bitmap. */
@@ -221,8 +244,12 @@ xrep_rtbitmap(
 	xfs_fileoff_t		bmp_bytes;
 	int			error;
 
-	/* We require the realtime rmapbt to rebuild anything. */
-	if (!xfs_sb_version_hasrtrmapbt(&sc->mp->m_sb))
+	/*
+	 * We require the realtime rmapbt and atomic file updates to rebuild
+	 * anything.
+	 */
+	if (!xfs_sb_version_hasrtrmapbt(&sc->mp->m_sb) ||
+	    !xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
 		return -EOPNOTSUPP;
 
 	bmp_bytes = XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks);
@@ -240,8 +267,17 @@ xrep_rtbitmap(
 	if (error)
 		goto out;
 
+	/*
+	 * Trylock the temporary file.  We had better be the only ones holding
+	 * onto this inode...
+	 */
+	if (!xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL))
+		return -EAGAIN;
+	sc->temp_ilock_flags = XFS_ILOCK_EXCL;
+
 	/* Make sure we have space allocated for the entire bitmap file. */
 	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
 	error = xrep_fallocate(sc, 0, sc->mp->m_sb.sb_rbmblocks);
 	if (error)
 		goto out;
@@ -249,6 +285,12 @@ xrep_rtbitmap(
 	/* Copy the bitmap file that we generated. */
 	error = xrep_set_file_contents(sc, xrep_rtbitmap_get_buf, rb.bmpfile,
 			bmp_bytes);
+	if (error)
+		goto out;
+
+	/* Now swap the extents. */
+	error = xfs_swapext_atomic(&sc->tp, sc->ip, sc->tempip, XFS_DATA_FORK,
+			0, 0, sc->mp->m_sb.sb_rbmblocks, 0);
 out:
 	fput(rb.bmpfile);
 	return error;
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index e2b4638fa7cc..ccb220c184f1 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -20,6 +20,7 @@
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/xfile.h"
+#include "scrub/repair.h"
 
 /*
  * Realtime Summary
@@ -61,6 +62,17 @@ xchk_setup_rtsummary(
 	struct xfs_mount	*mp = sc->mp;
 	int			error;
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+		if (!xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
+			return -EOPNOTSUPP;
+
+		error = xrep_create_tempfile(sc, S_IFREG);
+		if (error)
+			return error;
+	}
+#endif
+
 	error = xchk_setup_fs(sc, ip);
 	if (error)
 		return error;
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index 78814b6a9c71..9c1fd759b730 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -16,6 +16,7 @@
 #include "xfs_inode.h"
 #include "xfs_bit.h"
 #include "xfs_bmap.h"
+#include "xfs_swapext.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -28,7 +29,29 @@ xrep_rtsum_get_buf(
 	xfs_fileoff_t		off,
 	struct xfs_buf		**bpp)
 {
-	return xfs_rtbuf_get(sc->mp, sc->tp, off, 1, bpp);
+	struct xfs_bmbt_irec	map;
+	struct xfs_buf		*bp;
+	struct xfs_mount	*mp = sc->mp;
+	int			nmap = 1;
+	int			error;
+
+	error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmap,
+			XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+		return -EFSCORRUPTED;
+
+	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, map.br_startblock),
+			mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
+	*bpp = bp;
+	return 0;
 }
 
 /* Repair the realtime summary. */
@@ -38,18 +61,37 @@ xrep_rtsummary(
 {
 	int			error;
 
+	/* We require atomic file swap to be able to fix rt summaries. */
+	if (!xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
+		return -EOPNOTSUPP;
+
 	/* Make sure any problems with the fork are fixed. */
 	error = xrep_metadata_inode_forks(sc);
 	if (error)
 		return error;
 
+	/*
+	 * Trylock the temporary file.  We had better be the only ones holding
+	 * onto this inode...
+	 */
+	if (!xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL))
+		return -EAGAIN;
+	sc->temp_ilock_flags = XFS_ILOCK_EXCL;
+
 	/* Make sure we have space allocated for the entire summary file. */
 	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
 	error = xrep_fallocate(sc, 0, XFS_B_TO_FSB(sc->mp, sc->mp->m_rsumsize));
 	if (error)
 		return error;
 
 	/* Copy the rtsummary file that we generated. */
-	return xrep_set_file_contents(sc, xrep_rtsum_get_buf, sc->xfile,
+	error = xrep_set_file_contents(sc, xrep_rtsum_get_buf, sc->xfile,
 			sc->mp->m_rsumsize);
+	if (error)
+		return error;
+
+	/* Now swap the extents. */
+	return xfs_swapext_atomic(&sc->tp, sc->ip, sc->tempip, XFS_DATA_FORK,
+			0, 0, XFS_B_TO_FSB(sc->mp, sc->mp->m_rsumsize), 0);
 }


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] xfs: use atomic extent swapping to repair extended attributes
  2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
                   ` (2 preceding siblings ...)
  2020-04-29  2:46 ` [PATCH 3/5] xfs: use atomic extent swapping to repair rt metadata Darrick J. Wong
@ 2020-04-29  2:46 ` Darrick J. Wong
  2020-04-29  2:46 ` [PATCH 5/5] xfs: use atomic extent swapping to repair directories Darrick J. Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

When repairing extended attributes online, stage the new xattr contents
in a temporary file and use the atomic extent swapping mechanism to
commit the results in bulk.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_log_format.h |    6 -
 fs/xfs/libxfs/xfs_swapext.c    |   53 +++++
 fs/xfs/libxfs/xfs_swapext.h    |    1 
 fs/xfs/scrub/attr.c            |   12 +
 fs/xfs/scrub/attr_repair.c     |  453 ++++++++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/bitmap.c          |   22 ++
 fs/xfs/scrub/bitmap.h          |    1 
 fs/xfs/xfs_bmap_util.c         |    2 
 fs/xfs/xfs_bmap_util.h         |    3 
 fs/xfs/xfs_trace.h             |    3 
 10 files changed, 525 insertions(+), 31 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index ceb67213df64..5fc1a11572da 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -813,8 +813,12 @@ struct xfs_swap_extent {
 /* Set the file sizes when finished. */
 #define XFS_SWAP_EXTENT_SET_SIZES	(1ULL << 1)
 
+/* Try to convert inode2 from block to short format at the end, if possible. */
+#define XFS_SWAP_EXTENT_TO_SHORTFORM2	(1ULL << 2)
+
 #define XFS_SWAP_EXTENT_FLAGS		(XFS_SWAP_EXTENT_ATTR_FORK | \
-					 XFS_SWAP_EXTENT_SET_SIZES)
+					 XFS_SWAP_EXTENT_SET_SIZES | \
+					 XFS_SWAP_EXTENT_TO_SHORTFORM2)
 
 /* This is the structure used to lay out an sxi log item in the log. */
 struct xfs_sxi_log_format {
diff --git a/fs/xfs/libxfs/xfs_swapext.c b/fs/xfs/libxfs/xfs_swapext.c
index 64083d48fb7d..f16f7d9a0b66 100644
--- a/fs/xfs/libxfs/xfs_swapext.c
+++ b/fs/xfs/libxfs/xfs_swapext.c
@@ -20,6 +20,9 @@
 #include "xfs_trace.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
 
 /* Information to help us reset reflink flag / CoW fork state after a swap. */
 
@@ -200,12 +203,45 @@ xfs_swapext_update_size(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
+/* Convert inode2's leaf attr fork back to shortform, if possible. */
+STATIC int
+xfs_swapext_attr_to_shortform2(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_da_args	args = {
+		.dp		= sxi->si_ip2,
+		.geo		= tp->t_mountp->m_attr_geo,
+		.whichfork	= XFS_ATTR_FORK,
+		.trans		= tp,
+	};
+	struct xfs_buf		*bp;
+	int			forkoff;
+	int			error;
+
+	if (!xfs_bmap_one_block(sxi->si_ip2, XFS_ATTR_FORK))
+		return 0;	/* skip the conversion; not an error */
+
+	error = xfs_attr3_leaf_read(tp, sxi->si_ip2, 0, &bp);
+	if (error)
+		return error;
+
+	forkoff = xfs_attr_shortform_allfit(bp, sxi->si_ip2);
+	if (forkoff == 0)
+		return 0;	/* presumably the attrs don't all fit; TODO confirm */
+
+	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
+}
+
+#define XFS_SWAP_EXTENT_POST_PROCESSING (XFS_SWAP_EXTENT_TO_SHORTFORM2)
+
 /* Do we have more work to do to finish this operation? */
 bool
 xfs_swapext_has_more_work(
 	struct xfs_swapext_intent	*sxi)
 {
-	return sxi->si_blockcount > 0;
+	return sxi->si_blockcount > 0 ||
+		(sxi->si_flags & XFS_SWAP_EXTENT_POST_PROCESSING);
 }
 
 /* Finish one extent swap, possibly log more. */
@@ -218,12 +254,23 @@ xfs_swapext_finish_one(
 	int				whichfork;
 	int				nimaps;
 	int				bmap_flags;
-	int				error;
+	int				error = 0;
 
 	whichfork = (sxi->si_flags & XFS_SWAP_EXTENT_ATTR_FORK) ?
 			XFS_ATTR_FORK : XFS_DATA_FORK;
 	bmap_flags = xfs_bmapi_aflag(whichfork);
 
+	/* Do any post-processing work that requires a transaction roll. */
+	if (sxi->si_blockcount == 0) {
+		if (sxi->si_flags & XFS_SWAP_EXTENT_TO_SHORTFORM2) {
+			if (sxi->si_flags & XFS_SWAP_EXTENT_ATTR_FORK)
+				error = xfs_swapext_attr_to_shortform2(tp, sxi);
+			sxi->si_flags &= ~XFS_SWAP_EXTENT_TO_SHORTFORM2;
+			return error;
+		}
+		return 0;
+	}
+
 	while (sxi->si_blockcount > 0) {
 		int64_t		ip1_delta = 0, ip2_delta = 0;
 
@@ -385,6 +432,8 @@ xfs_swapext_init_intent(
 		sxi->si_isize1 = ip2->i_d.di_size;
 		sxi->si_isize2 = ip1->i_d.di_size;
 	}
+	if (flags & XFS_SWAPEXT_TO_SHORTFORM2)
+		sxi->si_flags |= XFS_SWAP_EXTENT_TO_SHORTFORM2;
 	sxi->si_ip1 = ip1;
 	sxi->si_ip2 = ip2;
 	sxi->si_startoff1 = startoff1;
diff --git a/fs/xfs/libxfs/xfs_swapext.h b/fs/xfs/libxfs/xfs_swapext.h
index f4146f55a4c9..2ac08a25f0d9 100644
--- a/fs/xfs/libxfs/xfs_swapext.h
+++ b/fs/xfs/libxfs/xfs_swapext.h
@@ -49,6 +49,7 @@ int xfs_swapext_finish_one(struct xfs_trans *tp,
 		struct xfs_swapext_intent *sxi_state);
 
 #define XFS_SWAPEXT_SET_SIZES		(1U << 0)
+#define XFS_SWAPEXT_TO_SHORTFORM2	(1U << 1)
 int xfs_swapext_atomic(struct xfs_trans **tpp, struct xfs_inode *ip1,
 		struct xfs_inode *ip2, int whichfork, xfs_fileoff_t startoff1,
 		xfs_fileoff_t startoff2, xfs_filblks_t blockcount,
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index faacdb9f9f1e..b2cde8cd8244 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -19,6 +19,7 @@
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
 #include "scrub/attr.h"
+#include "scrub/repair.h"
 
 /*
  * Allocate enough memory to hold an attr value and attr block bitmaps,
@@ -80,6 +81,17 @@ xchk_setup_xattr(
 {
 	int			error;
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+		if (!xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
+			return -EOPNOTSUPP;
+
+		error = xrep_create_tempfile(sc, S_IFREG);
+		if (error)
+			return error;
+	}
+#endif
+
 	/*
 	 * We failed to get memory while checking attrs, so this time try to
 	 * get all the memory we're ever going to need.  Allocate the buffer
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
index f1d7b1808498..d2563dd6c2d2 100644
--- a/fs/xfs/scrub/attr_repair.c
+++ b/fs/xfs/scrub/attr_repair.c
@@ -24,6 +24,8 @@
 #include "xfs_attr_sf.h"
 #include "xfs_attr_remote.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_swapext.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -445,14 +447,15 @@ xrep_attr_walk_blind(
 				len--;
 			}
 
-			if (flags & XREP_ATTR_WALK_INCORE)
-				continue;
-
 			/*
-			 * If we didn't find a buffer, read 1 block from disk.
-			 * We don't attach any buffer ops.
+			 * If we didn't find an incore buffer, reset len to 1
+			 * so that we can make forward progress.
 			 */
 			len = 1;
+
+			if (flags & XREP_ATTR_WALK_INCORE)
+				continue;
+
 			error = xfs_buf_read(mp->m_ddev_targp, daddr,
 					XFS_FSB_TO_BB(mp, len),
 					XBF_TRYLOCK, &bp, NULL);
@@ -699,7 +702,8 @@ xrep_xattr_insert_rec(
 	 */
 	name[XATTR_NAME_MAX] = 0;
 
-	error = xblob_get(rx->xattr_blobs, key->name_cookie, name, key->namelen);
+	error = xblob_get(rx->xattr_blobs, key->name_cookie, name,
+			key->namelen);
 	if (error)
 		return error;
 
@@ -718,10 +722,10 @@ xrep_xattr_insert_rec(
 
 	name[key->namelen] = 0;
 
-	trace_xrep_xattr_insert_rec(rx->sc->ip, key->flags, name, key->namelen,
-			key->valuelen);
+	trace_xrep_xattr_insert_rec(rx->sc->tempip, key->flags, name,
+			key->namelen, key->valuelen);
 
-	args.dp = rx->sc->ip;
+	args.dp = rx->sc->tempip;
 	args.attr_filter = key->flags;
 	args.name = name;
 	args.namelen = key->namelen;
@@ -731,20 +735,407 @@ xrep_xattr_insert_rec(
 }
 
 /*
- * Insert all the attributes that we collected.
+ * Prepare both inodes' attribute forks for extent swapping.  Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format attr fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_xattr_swap_prep(
+	struct xfs_scrub	*sc,
+	bool			temp_local,
+	bool			ip_local)
+{
+	int			error;
+
+	/*
+	 * If the tempfile's attributes are in shortform format, convert that
+	 * to a single leaf extent so that we can use the atomic extent swap.
+	 */
+	if (temp_local) {
+		struct xfs_buf		*leaf_bp = NULL;
+		struct xfs_da_args	args = {
+			.dp		= sc->tempip,
+			.geo		= sc->mp->m_attr_geo,
+			.whichfork	= XFS_ATTR_FORK,
+			.trans		= sc->tp,
+			.total		= 1,
+		};
+
+		error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
+		if (error)
+			return error;
+
+		/*
+		 * Roll the deferred log items to get a clean transaction.
+		 * Hold the leaf buffer across the roll so that the AIL cannot
+		 * grab our half-baked block; unhold it before checking error.
+		 */
+		xfs_trans_bhold(sc->tp, leaf_bp);
+		error = xfs_defer_finish(&sc->tp);
+		xfs_trans_bhold_release(sc->tp, leaf_bp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform attr fork, convert it to
+	 * an empty extent list in preparation for the atomic extent swap.
+	 */
+	if (ip_local) {
+		struct xfs_ifork	*ifp;
+
+		sc->ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+		sc->ip->i_d.di_anextents = 0;
+
+		ifp = XFS_IFORK_PTR(sc->ip, XFS_ATTR_FORK);
+		xfs_ifork_reset(ifp);
+		ifp->if_bytes = 0;
+		ifp->if_u1.if_root = NULL;
+		ifp->if_height = 0;
+		ifp->if_flags |= XFS_IFEXTENTS;
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	return 0;
+}
+
+/* State we need to track while rewriting attr block owners. */
+struct xrep_xattr_swap_owner {
+	struct xfs_attr_list_context	ctx;	/* attr list walk state */
+	struct xbitmap			rmt_blocks; /* remote value ranges */
+	struct xfs_scrub		*sc;	/* scrub context */
+};
+
+/*
+ * Attr list callback: rewrite the owner field of every remote value block of
+ * one attribute to match the file that's being repaired.  In-core buffers for
+ * these values span a single extent and are never logged, so we record the
+ * range in rmt_blocks so that the leaf/node pass will skip these parts of the
+ * attr fork mappings.  On any failure, set seen_enough to stop the walk.
+ */
+static void
+xrep_xattr_swap_rmt_owner(
+	struct xfs_attr_list_context	*context,
+	int				flags,
+	unsigned char			*name,
+	int				namelen,
+	int				valuelen)
+{
+	struct xfs_da_args		args = {
+		.op_flags		= XFS_DA_OP_NOTIME,
+		.attr_filter		= flags & XFS_ATTR_NSP_ONDISK_MASK,
+		.geo			= context->dp->i_mount->m_attr_geo,
+		.whichfork		= XFS_ATTR_FORK,
+		.dp			= context->dp,
+		.name			= name,
+		.namelen		= namelen,
+		.hashval		= xfs_da_hashname(name, namelen),
+		.trans			= context->tp,
+		.value			= NULL,
+		.valuelen		= 0,
+	};
+	LIST_HEAD(buffer_list);
+	struct xfs_bmbt_irec		map;
+	struct xrep_xattr_swap_owner	*xso;
+	struct xfs_mount		*mp = context->dp->i_mount;
+	struct xfs_attr3_rmt_hdr	*rmt;
+	struct xfs_buf			*bp;
+	void				*p;
+	xfs_daddr_t			dblkno;
+	int				dblkcnt;
+	int				nmap;
+	int				error;
+
+	xso = container_of(context, struct xrep_xattr_swap_owner, ctx);
+
+	if (flags & (XFS_ATTR_LOCAL | XFS_ATTR_INCOMPLETE))
+		return;
+
+	error = xfs_attr_get_ilocked(&args);
+	if (error)
+		goto fail;
+
+	/*
+	 * Mark this region of the attr fork so that the leaf/node scan will
+	 * skip this part.
+	 */
+	error = xbitmap_set(&xso->rmt_blocks, args.rmtblkno, args.rmtblkcnt);
+	if (error)
+		goto fail;
+
+	while (args.rmtblkcnt > 0) {
+		nmap = 1;
+		error = xfs_bmapi_read(args.dp, args.rmtblkno, args.rmtblkcnt,
+				&map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error || nmap != 1)
+			goto fail;
+
+		if (!xfs_bmap_is_real_extent(&map))
+			goto fail;
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+		error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp,
+				&xfs_attr3_rmt_buf_ops);
+		if (error)
+			goto fail;
+
+		/*
+		 * Each rmt block within the buffer gets its own header, so
+		 * step through the buffer one attr block at a time.
+		 */
+		for (p = bp->b_addr;
+		     p < bp->b_addr + BBTOB(bp->b_length);
+		     p += mp->m_attr_geo->blksize) {
+			rmt = p;
+			rmt->rm_owner = cpu_to_be64(xso->sc->ip->i_ino);
+		}
+
+		xfs_buf_delwri_queue(bp, &buffer_list);
+		xfs_buf_relse(bp);
+
+		/* advance past the extent that we just rewrote */
+		args.rmtblkno += map.br_blockcount;
+		args.rmtblkcnt -= map.br_blockcount;
+	}
+
+	/* Write the entire remote value to disk. */
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error)
+		goto fail;
+
+	return;
+fail:
+	xfs_buf_delwri_cancel(&buffer_list);
+	context->seen_enough = 1;
+}
+
+/*
+ * Change the owner field of every leaf/node block in the tempfile's attr fork
+ * to match the file being repaired.  Ranges marked in rmt_blocks are remote
+ * value blocks, which this walk skips; everything else is fixed per block.
+ */
+STATIC int
+xrep_xattr_swap_leaf_owner(
+	struct xrep_xattr_swap_owner	*xso)
+{
+	struct xfs_bmbt_irec		map;
+	struct xfs_da_geometry		*geo = xso->sc->mp->m_attr_geo;
+	struct xfs_scrub		*sc = xso->sc;
+	struct xfs_da3_blkinfo		*info;
+	struct xfs_buf			*bp;
+	xfs_fileoff_t			offset = 0;
+	xfs_fileoff_t			end = -1U;	/* upper bound of walk */
+	xfs_dablk_t			dabno;
+	int				nmap;
+	int				error;
+
+	for (offset = 0;
+	     offset < end;
+	     offset = map.br_startoff + map.br_blockcount) {
+		nmap = 1;
+		error = xfs_bmapi_read(sc->tempip, offset, end - offset,
+				&map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -EFSCORRUPTED;
+		if (!xfs_bmap_is_real_extent(&map)) {
+			continue;
+		}
+
+		if (xbitmap_test(&xso->rmt_blocks, map.br_startoff,
+				 &map.br_blockcount)) {
+			continue;
+		}
+
+		for (dabno = round_up(map.br_startoff, geo->fsbcount);
+		     dabno < map.br_startoff + map.br_blockcount;
+		     dabno += geo->fsbcount) {
+			error = xfs_da_read_buf(sc->tp, sc->tempip,
+					dabno, 0, &bp, XFS_ATTR_FORK, NULL);
+			if (error)
+				return error;
+			if (!bp)
+				return -EFSCORRUPTED;
+
+			info = bp->b_addr;
+			info->owner = cpu_to_be64(sc->ip->i_ino);
+
+			/* If nobody set a buffer type or ops, set them now. */
+			if (bp->b_ops == NULL) {
+				switch (info->hdr.magic) {
+				case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+					bp->b_ops = &xfs_attr3_leaf_buf_ops;
+					break;
+				case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+					bp->b_ops = &xfs_da3_node_buf_ops;
+					break;
+				default:
+					xfs_trans_brelse(sc->tp, bp);
+					return -EFSCORRUPTED;
+				}
+				xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+			}
+
+			xfs_trans_ordered_buf(sc->tp, bp);
+			xfs_trans_brelse(sc->tp, bp);
+		}
+	}
+
+	return 0;
+}
+/*
+ * Walk the temporary file's xattr blocks, setting the owner field of each
+ * block to the new owner.  We use ordered and delwri buffers to flush
+ * everything out to disk ahead of committing the atomic extent swap.
+ * Rewriting the attr blocks like this is apparently safe because attr
+ * inactivation isn't picky about owner field enforcement(!)
+ */
+STATIC int
+xrep_xattr_swap_owner(
+	struct xfs_scrub		*sc)
+{
+	struct xrep_xattr_swap_owner	xso = {
+		.ctx.dp			= sc->tempip,
+		.ctx.resynch		= 1,
+		.ctx.put_listent	= xrep_xattr_swap_rmt_owner,
+		.ctx.allow_incomplete	= false,
+		.ctx.seen_enough	= 0,
+		.ctx.tp			= sc->tp,
+		.sc			= sc,
+	};
+	int				error;
+
+	xbitmap_init(&xso.rmt_blocks);
+
+	/* First pass -- change the owners of the remote blocks. */
+	error = xfs_attr_list_ilocked(&xso.ctx);
+	if (error)
+		goto out;
+	if (xso.ctx.seen_enough) {
+		error = -EFSCORRUPTED;
+		goto out;
+	}
+
+	/* Second pass -- change each attr leaf/node buffer. */
+	error = xrep_xattr_swap_leaf_owner(&xso);
+out:
+	xbitmap_destroy(&xso.rmt_blocks);
+	return error;
+}
+
+/*
+ * Both attr forks are in short format and the caller has checked that the
+ * rebuilt attrs fit, so resize the repaired file's attr fork and copy the
+ * tempfile's short format data straight into it.
+ */
+STATIC void
+xrep_xattr_swap_local(
+	struct xfs_scrub	*sc,
+	int			newsize,
+	int			forkoff)
+{
+	struct xfs_ifork	*temp_ifp = XFS_IFORK_PTR(sc->tempip,
+							XFS_ATTR_FORK);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(sc->ip, XFS_ATTR_FORK);
+
+	/* Set the new fork offset, then resize the attr fork to match. */
+	sc->ip->i_d.di_forkoff = forkoff;
+	xfs_idata_realloc(sc->ip, temp_ifp->if_bytes - ifp->if_bytes,
+			XFS_ATTR_FORK);
+
+	memcpy(ifp->if_u1.if_data, temp_ifp->if_u1.if_data, newsize);
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/* Swap the temporary file's attribute fork with the one being repaired. */
+STATIC int
+xrep_xattr_swap(
+	struct xrep_xattr	*rx)
+{
+	struct xfs_scrub	*sc = rx->sc;
+	unsigned int		resblks;
+	bool			ip_local, temp_local;
+	int			error;
+
+	resblks = xfs_swap_range_calc_resblks(sc->tempip, sc->ip,
+			XFS_ATTR_FORK, XFS_MAX_FILEOFF);
+	error = xchk_trans_alloc(sc, max(1U, resblks));
+	if (error)
+		return error;
+
+	/*
+	 * Lock both inodes and join them to the transaction.  NOTE(review):
+	 * ijoin gets no lock flags; sc's ilock flags record the held locks.
+	 */
+	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip,
+			XFS_ILOCK_EXCL);
+	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+	ip_local = XFS_IFORK_FORMAT(sc->ip, XFS_ATTR_FORK) ==
+				XFS_DINODE_FMT_LOCAL;
+	temp_local = XFS_IFORK_FORMAT(sc->tempip, XFS_ATTR_FORK) ==
+				XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format attr fork and the rebuilt
+	 * xattr data would fit in the repaired file's attr fork, just copy
+	 * the contents from the tempfile and declare ourselves done.
+	 */
+	if (ip_local && temp_local) {
+		int	forkoff;
+		int	newsize;
+
+		newsize = XFS_ATTR_SF_TOTSIZE(sc->tempip);
+		forkoff = xfs_attr_shortform_bytesfit(sc->ip, newsize);
+		if (forkoff > 0) {
+			xrep_xattr_swap_local(sc, newsize, forkoff);
+			return 0;
+		}
+	}
+
+	/* Otherwise, make sure both attr forks are in block-mapping mode. */
+	error = xrep_xattr_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	/* Rewrite the owner field of all attr blocks in the temporary file. */
+	error = xrep_xattr_swap_owner(sc);
+	if (error)
+		return error;
+
+	return xfs_swapext_atomic(&sc->tp, sc->tempip, sc->ip, XFS_ATTR_FORK,
+			0, 0, NULLFILEOFF, XFS_SWAPEXT_TO_SHORTFORM2);
+}
+
+/*
+ * Insert into the tempfile all the attributes that we collected.
  *
  * Commit the repair transaction and drop the ilock because the attribute
  * setting code needs to be able to allocate special transactions and take the
- * ilock on its own.  Some day we'll have deferred attribute setting, at which
- * point we'll be able to use that to replace the attributes atomically and
- * safely.
+ * ilock on its own.  The attributes are added to the temporary file (which can
+ * be disposed of easily on failure).  If we finish rebuilding all of the
+ * salvageable attrs, we can then use atomic extent swapping to commit the
+ * new attr index to the file.
  */
 STATIC int
 xrep_xattr_rebuild_tree(
 	struct xrep_xattr	*rx)
 {
+	uint64_t		nr_attrs = xfbma_length(rx->xattr_records);
 	int			error;
 
+	/* Nothing to salvage?  Zap the attr fork and finish. */
+	if (nr_attrs == 0) {
+		xfs_trans_ijoin(rx->sc->tp, rx->sc->ip, 0);
+		return xrep_xattr_reset_fork(rx->sc, rx->sc->ip);
+	}
+
 	/*
 	 * Commit the repair transaction and drop the ILOCK so that we can
 	 * use individual transactions to re-add each extended attribute.
@@ -772,8 +1163,29 @@ xrep_xattr_rebuild_tree(
 	if (error)
 		return error;
 
-	/* Re-add every attr to the file. */
-	return xfbma_iter_del(rx->xattr_records, xrep_xattr_insert_rec, rx);
+	/* Add every attr to the tempfile. */
+	error = xfbma_iter_del(rx->xattr_records, xrep_xattr_insert_rec, rx);
+	if (error)
+		return error;
+
+	/* Swap the tempfile's attr fork with the file being repaired. */
+	error = xrep_xattr_swap(rx);
+	if (error)
+		return error;
+
+	/*
+	 * Now wipe out the attr fork of the temp file so that regular inode
+	 * inactivation won't trip over the corrupt attr fork.  We're done
+	 * with the inode that we want to repair, so roll the transaction and
+	 * drop its ILOCK before we tackle the temporary file.
+	 */
+	error = xfs_trans_roll_inode(&rx->sc->tp, rx->sc->tempip);
+	if (error)
+		return error;
+	xfs_iunlock(rx->sc->ip, XFS_ILOCK_EXCL);
+	rx->sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+
+	return xrep_xattr_reset_fork(rx->sc, rx->sc->tempip);
 }
 
 /*
@@ -811,17 +1223,6 @@ xrep_xattr(
 	if (error)
 		goto out;
 
-	/*
-	 * Invalidate and truncate all attribute fork extents.  This is the
-	 * point at which we are no longer able to bail out gracefully.
-	 * We commit the transaction here because xfs_attr_set allocates its
-	 * own transactions.
-	 */
-	xfs_trans_ijoin(sc->tp, sc->ip, 0);
-	error = xrep_xattr_reset_fork(sc, sc->ip);
-	if (error)
-		goto out;
-
 	/* Now rebuild the attribute information. */
 	error = xrep_xattr_rebuild_tree(&rx);
 out:
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index a304a54997f9..25dfa1a4469e 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -382,3 +382,25 @@ xbitmap_count_set_regions(
 
 	return nr;
 }
+
+/* Is bit @start set?  Trim *len to how far that answer is known to hold. */
+bool
+xbitmap_test(
+	struct xbitmap		*bitmap,
+	uint64_t		start,
+	uint64_t		*len)
+{
+	struct xbitmap_node	*node;
+	uint64_t		end = start + *len - 1;
+
+	node = xbitmap_tree_iter_first(&bitmap->xb_root, start, end);
+	if (node == NULL)
+		return false;
+	if (node->bn_start > start) {
+		*len = node->bn_start - start;
+		return false;
+	}
+	if (node->bn_last < end)
+		*len = node->bn_last - start + 1;
+	return true;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 33548004f111..deb39528691e 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -39,5 +39,6 @@ int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
 
 bool xbitmap_empty(struct xbitmap *bitmap);
 uint64_t xbitmap_count_set_regions(struct xbitmap *bitmap);
+bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
 
 #endif	/* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index de6d1747a3fa..6a833ea58d0e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1597,7 +1597,7 @@ xfs_bmap_count_range_blocks(
  * Compute the number of blocks we need to reserve to handle a log-assisted
  * extent swap operation.
  */
-static inline unsigned int
+unsigned int
 xfs_swap_range_calc_resblks(
 	struct xfs_inode	*ip1,
 	struct xfs_inode	*ip2,
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index e0712c274dd2..1da6b4cdf0b4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -81,4 +81,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
 			      xfs_off_t len);
 
+unsigned int xfs_swap_range_calc_resblks(struct xfs_inode *ip1,
+		struct xfs_inode *ip2, int whichfork, xfs_filblks_t blockcount);
+
 #endif	/* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 306cf86c353d..03b736bc054c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3843,7 +3843,8 @@ DEFINE_NAMESPACE_EVENT(xfs_imeta_dir_zap);
 
 #define XFS_SWAPEXT_FLAGS \
 	{ XFS_SWAP_EXTENT_ATTR_FORK,		"ATTRFORK" }, \
-	{ XFS_SWAP_EXTENT_SET_SIZES,		"SETSIZES" }
+	{ XFS_SWAP_EXTENT_SET_SIZES,		"SETSIZES" }, \
+	{ XFS_SWAP_EXTENT_TO_SHORTFORM2,	"TO_SHORTFORM2" }
 
 TRACE_EVENT(xfs_swapext_defer,
 	TP_PROTO(struct xfs_mount *mp, const struct xfs_swapext_intent *sxi),


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] xfs: use atomic extent swapping to repair directories
  2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
                   ` (3 preceding siblings ...)
  2020-04-29  2:46 ` [PATCH 4/5] xfs: use atomic extent swapping to repair extended attributes Darrick J. Wong
@ 2020-04-29  2:46 ` Darrick J. Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Darrick J. Wong @ 2020-04-29  2:46 UTC (permalink / raw)
  To: darrick.wong; +Cc: linux-xfs

From: Darrick J. Wong <darrick.wong@oracle.com>

When repairing a directory online, stage the new directory contents in a
temporary file and use the atomic extent swapping mechanism to commit
the results in bulk.  As a side effect of this patch, directory
inactivation will be able to purge any leftover dir blocks.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_swapext.c |   34 ++++
 fs/xfs/scrub/dir.c          |   12 +
 fs/xfs/scrub/dir_repair.c   |  415 +++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_inode.c          |   49 +++++
 4 files changed, 488 insertions(+), 22 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_swapext.c b/fs/xfs/libxfs/xfs_swapext.c
index f16f7d9a0b66..afdc516428bd 100644
--- a/fs/xfs/libxfs/xfs_swapext.c
+++ b/fs/xfs/libxfs/xfs_swapext.c
@@ -23,6 +23,7 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr_leaf.h"
+#include "xfs_dir2_priv.h"
 
 /* Information to help us reset reflink flag / CoW fork state after a swap. */
 
@@ -233,6 +234,37 @@ xfs_swapext_attr_to_shortform2(
 	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
 }
 
+/* Convert inode2's block dir fork back to shortform, if possible. */
+STATIC int
+xfs_swapext_dir_to_shortform2(
+	struct xfs_trans		*tp,
+	struct xfs_swapext_intent	*sxi)
+{
+	struct xfs_da_args	args = {
+		.dp		= sxi->si_ip2,
+		.geo		= tp->t_mountp->m_dir_geo,
+		.whichfork	= XFS_DATA_FORK,
+		.trans		= tp,
+	};
+	struct xfs_dir2_sf_hdr	sfh;
+	struct xfs_buf		*bp;
+	int			size;	/* computed shortform size */
+	int			error;
+
+	if (!xfs_bmap_one_block(sxi->si_ip2, XFS_DATA_FORK))
+		return 0;
+
+	error = xfs_dir3_block_read(tp, sxi->si_ip2, &bp);
+	if (error)
+		return error;
+
+	size = xfs_dir2_block_sfsize(sxi->si_ip2, bp->b_addr, &sfh);
+	if (size > XFS_IFORK_DSIZE(sxi->si_ip2))
+		return 0;
+
+	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
+}
+
 #define XFS_SWAP_EXTENT_POST_PROCESSING (XFS_SWAP_EXTENT_TO_SHORTFORM2)
 
 /* Do we have more work to do to finish this operation? */
@@ -265,6 +297,8 @@ xfs_swapext_finish_one(
 		if (sxi->si_flags & XFS_SWAP_EXTENT_TO_SHORTFORM2) {
 			if (sxi->si_flags & XFS_SWAP_EXTENT_ATTR_FORK)
 				error = xfs_swapext_attr_to_shortform2(tp, sxi);
+			else if (S_ISDIR(VFS_I(sxi->si_ip2)->i_mode))
+				error = xfs_swapext_dir_to_shortform2(tp, sxi);
 			sxi->si_flags &= ~XFS_SWAP_EXTENT_TO_SHORTFORM2;
 			return error;
 		}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index e318dd46cb15..948b7440e591 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -18,6 +18,7 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/dabtree.h"
+#include "scrub/repair.h"
 
 /* Set us up to scrub directories. */
 int
@@ -28,6 +29,17 @@ xchk_setup_directory(
 	unsigned int		sz;
 	int			error;
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+		if (!xfs_sb_version_hasatomicswap(&sc->mp->m_sb))
+			return -EOPNOTSUPP;
+
+		error = xrep_create_tempfile(sc, S_IFDIR);
+		if (error)
+			return error;
+	}
+#endif
+
 	if (sc->flags & XCHK_TRY_HARDER) {
 		error = xchk_fs_freeze(sc);
 		if (error)
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index b299f8b35ce4..3004505c55a9 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -25,6 +25,8 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_trans_space.h"
 #include "xfs_iwalk.h"
+#include "xfs_swapext.h"
+#include "xfs_bmap_util.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -78,6 +80,9 @@ struct xrep_dir {
 	 * found a good candidate.
 	 */
 	xfs_ino_t		parent_ino;
+
+	/* nlink value of the corrected directory. */
+	xfs_nlink_t		new_nlink;
 };
 
 /*
@@ -523,7 +528,6 @@ xrep_dir_reset_fork(
 	dp->i_d.di_size = 0;
 
 	/* Reinitialize the short form directory. */
-	set_nlink(VFS_I(dp), 2);
 	args->geo = sc->mp->m_dir_geo;
 	args->dp = dp;
 	args->trans = sc->tp;
@@ -610,10 +614,10 @@ xrep_dir_insert_rec(
 	if (error)
 		return error;
 
-	trace_xrep_dir_insert_rec(rd->sc->ip, namebuf, key->namelen, key->ino,
-			key->ftype);
+	trace_xrep_dir_insert_rec(rd->sc->tempip, namebuf, key->namelen,
+			key->ino, key->ftype);
 
-	error = xfs_qm_dqattach(rd->sc->ip);
+	error = xfs_qm_dqattach(rd->sc->tempip);
 	if (error)
 		return error;
 
@@ -626,18 +630,19 @@ xrep_dir_insert_rec(
 	if (error)
 		return error;
 
-	xfs_ilock(rd->sc->ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, rd->sc->ip, XFS_ILOCK_EXCL);
+	xfs_ilock(rd->sc->tempip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, rd->sc->tempip, XFS_ILOCK_EXCL);
 
 	name.len = key->namelen;
 	name.type = key->ftype;
-	error = xfs_dir_createname(tp, rd->sc->ip, &name, key->ino, resblks);
+	error = xfs_dir_createname(tp, rd->sc->tempip, &name, key->ino,
+			resblks);
 	if (error)
 		goto err;
 
 	if (name.type == XFS_DIR3_FT_DIR)
-		inc_nlink(VFS_I(rd->sc->ip));
-	xfs_trans_log_inode(tp, rd->sc->ip, XFS_ILOG_CORE);
+		rd->new_nlink++;
+	xfs_trans_log_inode(tp, rd->sc->tempip, XFS_ILOG_CORE);
 	return xfs_trans_commit(tp);
 
 err:
@@ -645,6 +650,356 @@ xrep_dir_insert_rec(
 	return error;
 }
 
+/*
+ * Prepare both inodes' directory forks for extent swapping.  Promote the
+ * tempfile from short format to block format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ */
+STATIC int
+xrep_dir_swap_prep(
+	struct xfs_scrub	*sc,
+	bool			temp_local,
+	bool			ip_local)
+{
+	int			error;
+
+	/*
+	 * If the tempfile's directory is in shortform format, convert that
+	 * to block format so that we can use the atomic extent swap.
+	 */
+	if (temp_local) {
+		struct xfs_da_args	args = {
+			.dp		= sc->tempip,
+			.geo		= sc->mp->m_dir_geo,
+			.whichfork	= XFS_DATA_FORK,
+			.trans		= sc->tp,
+			.total		= 1,
+		};
+
+		error = xfs_dir2_sf_to_block(&args);
+		if (error)
+			return error;
+
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If the file being repaired had a shortform data fork, convert
+	 * that to an empty extent list in preparation for the atomic extent
+	 * swap.
+	 */
+	if (ip_local) {
+		struct xfs_ifork	*ifp;
+
+		sc->ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+		sc->ip->i_d.di_nextents = 0;
+
+		ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+		xfs_ifork_reset(ifp);
+		ifp->if_bytes = 0;
+		ifp->if_u1.if_root = NULL;
+		ifp->if_height = 0;
+		ifp->if_flags |= XFS_IFEXTENTS;
+
+		xfs_trans_log_inode(sc->tp, sc->ip,
+				XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	}
+
+	return 0;
+}
+
+/*
+ * Set the owner for this directory block to the directory being repaired.
+ * Pass back the magic number that we found; return 0 or a negative error.
+ */
+STATIC int
+xrep_dir_reset_owner(
+	struct xfs_scrub		*sc,
+	xfs_dablk_t			dabno,
+	struct xfs_buf			*bp,
+	unsigned int			*magic)
+{
+	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
+	struct xfs_dir3_data_hdr	*data3 = bp->b_addr;
+	struct xfs_da3_blkinfo		*info3 = bp->b_addr;
+	struct xfs_dir3_free_hdr	*free3 = bp->b_addr;
+	struct xfs_dir2_data_entry	*dep;
+
+	/* Directory data blocks. */
+	if (dabno < geo->leafblk) {
+		*magic = be32_to_cpu(data3->hdr.magic);
+		if (*magic != XFS_DIR3_BLOCK_MAGIC &&
+		    *magic != XFS_DIR3_DATA_MAGIC)
+			return -EFSCORRUPTED;
+
+		/*
+		 * If this is a block format directory, it's possible that the
+		 * block was created as part of converting the temp directory
+		 * from short format to block format in order to use the atomic
+		 * extent swap.  In that case, the '.' entry will be set to
+		 * the temp dir, so find the dot entry and reset it.
+		 */
+		if (*magic == XFS_DIR3_BLOCK_MAGIC) {
+			dep = bp->b_addr + geo->data_entry_offset;
+			if (dep->namelen != 1 || dep->name[0] != '.')
+				return -EFSCORRUPTED;
+
+			dep->inumber = cpu_to_be64(sc->ip->i_ino);
+		}
+
+		data3->hdr.owner = cpu_to_be64(sc->ip->i_ino);
+		return 0;
+	}
+
+	/* Directory leaf and da node blocks. */
+	if (dabno < geo->freeblk) {
+		*magic = be16_to_cpu(info3->hdr.magic);
+		switch (*magic) {
+		case XFS_DA3_NODE_MAGIC:
+		case XFS_DIR3_LEAF1_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			break;
+		default:
+			return -EFSCORRUPTED;
+		}
+
+		info3->owner = cpu_to_be64(sc->ip->i_ino);
+		return 0;
+	}
+
+	/* Directory free blocks. */
+	*magic = be32_to_cpu(free3->hdr.magic);
+	if (*magic != XFS_DIR3_FREE_MAGIC)
+		return -EFSCORRUPTED;
+
+	free3->hdr.owner = cpu_to_be64(sc->ip->i_ino);
+	return 0;
+}
+
+/*
+ * Attach the buffer verifier that matches the magic number found in a
+ * directory block.  Callers use this when the buffer had no ops set and the
+ * block has just been dirtied.  A magic that matches none of the directory
+ * block types below leaves the buffer ops as they were.
+ */
+STATIC void
+xrep_dir_set_verifier(
+	unsigned int		magic,
+	struct xfs_buf		*bp)
+{
+	const struct xfs_buf_ops	*ops = bp->b_ops;
+
+	/* Choose the verifier by block type. */
+	if (magic == XFS_DIR3_BLOCK_MAGIC)
+		ops = &xfs_dir3_block_buf_ops;
+	else if (magic == XFS_DIR3_DATA_MAGIC)
+		ops = &xfs_dir3_data_buf_ops;
+	else if (magic == XFS_DA3_NODE_MAGIC)
+		ops = &xfs_da3_node_buf_ops;
+	else if (magic == XFS_DIR3_LEAF1_MAGIC)
+		ops = &xfs_dir3_leaf1_buf_ops;
+	else if (magic == XFS_DIR3_LEAFN_MAGIC)
+		ops = &xfs_dir3_leafn_buf_ops;
+	else if (magic == XFS_DIR3_FREE_MAGIC)
+		ops = &xfs_dir3_free_buf_ops;
+
+	/* Unrecognized magic numbers fall through with ops unchanged. */
+	bp->b_ops = ops;
+
+	xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+}
+
+/*
+ * Change the owner field of every block in the tempdir's data fork to match
+ * the directory being repaired, one directory block at a time.
+ */
+STATIC int
+xrep_dir_swap_owner(
+	struct xfs_scrub		*sc)
+{
+	struct xfs_bmbt_irec		map;
+	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
+	struct xfs_buf			*bp;
+	xfs_fileoff_t			offset = 0;
+	xfs_fileoff_t			end = XFS_MAX_FILEOFF;
+	xfs_dablk_t			dabno;
+	int				nmap;
+	int				error;
+
+	for (offset = 0;
+	     offset < end;
+	     offset = map.br_startoff + map.br_blockcount) {
+		nmap = 1;
+		error = xfs_bmapi_read(sc->tempip, offset, end - offset,
+				&map, &nmap, 0);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -EFSCORRUPTED;
+		if (!xfs_bmap_is_real_extent(&map))
+			continue;
+
+
+		for (dabno = round_up(map.br_startoff, geo->fsbcount);
+		     dabno < map.br_startoff + map.br_blockcount;
+		     dabno += geo->fsbcount) {
+			unsigned int	magic;	/* set by xrep_dir_reset_owner */
+
+			error = xfs_da_read_buf(sc->tp, sc->tempip,
+					dabno, 0, &bp, XFS_DATA_FORK, NULL);
+			if (error)
+				return error;
+			if (!bp)
+				return -EFSCORRUPTED;
+
+			error = xrep_dir_reset_owner(sc, dabno, bp, &magic);
+			if (error) {
+				xfs_trans_brelse(sc->tp, bp);
+				return error;
+			}
+
+			if (bp->b_ops == NULL)
+				xrep_dir_set_verifier(magic, bp);
+
+			xfs_trans_ordered_buf(sc->tp, bp);
+			xfs_trans_brelse(sc->tp, bp);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Both directories are in short format and the caller has checked that the
+ * rebuilt contents fit, so resize the repaired directory's data fork to the
+ * tempdir's size and copy @newsize bytes.  NOTE(review): di_size is untouched.
+ */
+STATIC void
+xrep_dir_swap_local(
+	struct xfs_scrub	*sc,
+	int			newsize)
+{
+	struct xfs_ifork	*ifp1, *ifp2;
+
+	ifp1 = XFS_IFORK_PTR(sc->tempip, XFS_DATA_FORK);
+	ifp2 = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+
+	xfs_idata_realloc(sc->ip, ifp1->if_bytes - ifp2->if_bytes,
+			XFS_DATA_FORK);
+
+	memcpy(ifp2->if_u1.if_data, ifp1->if_u1.if_data, newsize);
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+struct xfs_name xfs_name_dot = { (unsigned char *)".", 1, XFS_DIR3_FT_DIR };
+
+/* Swap the temporary directory's data fork with the one being repaired. */
+STATIC int
+xrep_dir_swap(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	unsigned int		resblks;
+	bool			ip_local, temp_local;
+	int			error;
+
+	resblks = xfs_swap_range_calc_resblks(sc->tempip, sc->ip,
+			XFS_DATA_FORK, XFS_MAX_FILEOFF);
+	error = xchk_trans_alloc(sc, max(1U, resblks));
+	if (error)
+		return error;
+
+	/*
+	 * Lock both inodes and join them to the transaction.  NOTE(review):
+	 * ijoin gets no lock flags; sc's ilock flags record the held locks.
+	 */
+	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip,
+			XFS_ILOCK_EXCL);
+	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+	xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+	/*
+	 * Reset the temporary directory's '.' entry to point to the directory
+	 * we're repairing.  Note: shortform directories lack the dot entry.
+	 *
+	 * It's possible that this replacement could also expand a sf tempdir
+	 * into block format.
+	 */
+	if (XFS_IFORK_FORMAT(sc->tempip, XFS_DATA_FORK) !=
+			XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir_replace(sc->tp, sc->tempip, &xfs_name_dot,
+				sc->ip->i_ino, resblks);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Reset the temporary directory's '..' entry to point to the parent
+	 * that we found.  The temporary directory was created with the root
+	 * directory as the parent, so we can skip this if repairing a
+	 * subdirectory of the root.
+	 *
+	 * It's also possible that this replacement could also expand a sf
+	 * tempdir into block format.
+	 */
+	if (rd->parent_ino != sc->mp->m_rootip->i_ino) {
+		error = xfs_dir_replace(sc->tp, rd->sc->tempip,
+				&xfs_name_dotdot, rd->parent_ino, resblks);
+		if (error)
+			return error;
+	}
+
+	/* XXX: do we need to roll the transaction here? */
+
+	/*
+	 * Changing the dot and dotdot entries could have changed the shape of
+	 * the directory, so we recompute these.
+	 */
+	ip_local = XFS_IFORK_FORMAT(sc->ip, XFS_DATA_FORK) ==
+				XFS_DINODE_FMT_LOCAL;
+	temp_local = XFS_IFORK_FORMAT(sc->tempip, XFS_DATA_FORK) ==
+				XFS_DINODE_FMT_LOCAL;
+
+	/*
+	 * If both files have a local format data fork and the rebuilt
+	 * directory data would fit in the repaired file's data fork, copy
+	 * the contents from the tempfile and declare ourselves done.
+	 */
+	if (ip_local && temp_local) {
+		if (sc->tempip->i_d.di_size <= XFS_IFORK_DSIZE(sc->ip)) {
+			xrep_dir_swap_local(sc, sc->tempip->i_d.di_size);
+			set_nlink(VFS_I(sc->ip), rd->new_nlink);
+			return 0;
+		}
+	}
+
+	/* Otherwise, make sure both data forks are in block-mapping mode. */
+	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+	if (error)
+		return error;
+
+	/* Rewrite the owner field of all dir blocks in the temporary file. */
+	error = xrep_dir_swap_owner(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Set nlink of the directory under repair to the number of
+	 * subdirectories that will be in the new directory data.  Do this in
+	 * the same transaction sequence that (atomically) commits the new
+	 * data.
+	 */
+	set_nlink(VFS_I(sc->ip), rd->new_nlink);
+
+	return xfs_swapext_atomic(&sc->tp, sc->tempip, sc->ip, XFS_DATA_FORK,
+			0, 0, NULLFILEOFF,
+			XFS_SWAPEXT_SET_SIZES | XFS_SWAPEXT_TO_SHORTFORM2);
+}
+
 /*
  * Insert all the attributes that we collected.
  *
@@ -669,6 +1024,10 @@ xrep_dir_rebuild_tree(
 	if (error)
 		return error;
 
+	/*
+	 * Drop the ILOCK so that we don't pin the tail of the log.  We still
+	 * hold the IOLOCK (aka i_rwsem) which will prevent directory access.
+	 */
 	xfs_iunlock(rd->sc->ip, XFS_ILOCK_EXCL);
 	rd->sc->ilock_flags &= ~XFS_ILOCK_EXCL;
 
@@ -680,8 +1039,30 @@ xrep_dir_rebuild_tree(
 	if (error)
 		return error;
 
-	/* Re-add every entry to the directory. */
-	return xfbma_iter_del(rd->dir_entries, xrep_dir_insert_rec, rd);
+	/* Re-add every entry to the temporary directory. */
+	error = xfbma_iter_del(rd->dir_entries, xrep_dir_insert_rec, rd);
+	if (error)
+		return error;
+
+	/* Swap the tempdir's data fork with the file being repaired. */
+	error = xrep_dir_swap(rd);
+	if (error)
+		return error;
+
+	/*
+	 * Now reset the data fork of the temp directory to an empty shortform
+	 * directory because inactivation does nothing for directories.  We're
+	 * done with the inode that we want to repair, so roll the transaction
+	 * and drop its ILOCK before we tackle the temporary file.
+	 */
+	error = xfs_trans_roll_inode(&rd->sc->tp, rd->sc->tempip);
+	if (error)
+		return error;
+	xfs_iunlock(rd->sc->ip, XFS_ILOCK_EXCL);
+	rd->sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+
+	return xrep_dir_reset_fork(rd->sc, rd->sc->tempip,
+			rd->sc->mp->m_rootip->i_ino);
 }
 
 /*
@@ -821,6 +1202,7 @@ xrep_dir(
 	struct xrep_dir		rd = {
 		.sc		= sc,
 		.parent_ino	= NULLFSINO,
+		.new_nlink	= 2,
 	};
 	int			error;
 
@@ -860,17 +1242,6 @@ xrep_dir(
 	if (error)
 		goto out;
 
-	/*
-	 * Invalidate and truncate all data fork extents.  This is the point at
-	 * which we are no longer able to bail out gracefully.  We commit the
-	 * transaction here because the rebuilding step allocates its own
-	 * transactions.
-	 */
-	xfs_trans_ijoin(sc->tp, sc->ip, 0);
-	error = xrep_dir_reset_fork(sc, sc->ip, rd.parent_ino);
-	if (error)
-		goto out;
-
 	/* Now rebuild the directory information. */
 	error = xrep_dir_rebuild_tree(&rd);
 out:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 080c8838fba5..301fb9afbfde 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -17,6 +17,7 @@
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_attr.h"
+#include "xfs_bit.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
@@ -1292,6 +1293,49 @@ xfs_release(
 	return 0;
 }
 
+/*
+ * Mark stale each buffer that xfs_buf_incore finds for this directory's data
+ * fork mappings.  In theory we should never be freeing a directory with any
+ * blocks at all, but this covers the case where we've recovered a directory
+ * swap with a "temporary" directory created by online repair and must dump it.
+ */
+STATIC void
+xfs_inactive_dir(
+	struct xfs_inode	*dp)
+{
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	struct xfs_buf		*bp;
+	xfs_fileoff_t		off;	/* file offset being probed */
+
+	/*
+	 * Invalidate each directory block.  All directory blocks are of
+	 * fsbcount length and alignment, so within each data fork mapping we
+	 * only probe fsbcount-aligned file offsets.  We hold the only
+	 * reference to this inode, so we must wait for the buffer locks.
+	 */
+	for_each_xfs_iext(ifp, &icur, &got) {
+		for (off = round_up(got.br_startoff, geo->fsbcount);
+		     off < got.br_startoff + got.br_blockcount;
+		     off += geo->fsbcount) {
+			xfs_fsblock_t	fsbno;	/* fs block mapped at off */
+
+			fsbno = (off - got.br_startoff) + got.br_startblock;
+			bp = xfs_buf_incore(mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(mp, fsbno),
+					XFS_FSB_TO_BB(mp, geo->fsbcount),
+					XBF_SCAN_STALE);
+			if (bp) {	/* skip offsets where no buffer came back */
+				xfs_buf_stale(bp);
+				xfs_buf_relse(bp);
+			}
+		}
+	}
+}
+
 /*
  * xfs_inactive_truncate
  *
@@ -1694,6 +1738,11 @@ xfs_inactive(
 	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
 		truncate = 1;
 
+	if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_d.di_nextents > 0) {
+		xfs_inactive_dir(ip);
+		truncate = 1;
+	}
+
 	if (S_ISLNK(VFS_I(ip)->i_mode))
 		error = xfs_inactive_symlink(ip);
 	else if (truncate)


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-04-29  2:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-29  2:46 [PATCH RFCRAP 0/5] xfs: atomic file metadata repairs Darrick J. Wong
2020-04-29  2:46 ` [PATCH 1/5] xfs: parent repair should try the dcache first Darrick J. Wong
2020-04-29  2:46 ` [PATCH 2/5] xfs: create temporary files and directories for online repair Darrick J. Wong
2020-04-29  2:46 ` [PATCH 3/5] xfs: use atomic extent swapping to repair rt metadata Darrick J. Wong
2020-04-29  2:46 ` [PATCH 4/5] xfs: use atomic extent swapping to repair extended attributes Darrick J. Wong
2020-04-29  2:46 ` [PATCH 5/5] xfs: use atomic extent swapping to repair directories Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).