* [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
@ 2023-08-30 15:26 Darrick J. Wong
2023-08-30 22:29 ` Dave Chinner
` (7 more replies)
0 siblings, 8 replies; 15+ messages in thread
From: Darrick J. Wong @ 2023-08-30 15:26 UTC (permalink / raw)
To: Dave Chinner, Eric Sandeen; +Cc: xfs, shrikanth hegde, Ritesh Harjani
From: Darrick J. Wong <djwong@kernel.org>
shrikanth hegde reports that filesystems fail shortly after mount with
the following failure:
WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
From diagnostic data collected by the bug reporters, it would appear
that we cleanly mounted a filesystem that contained unlinked inodes.
Unlinked inodes are only processed as a final step of log recovery,
which means that clean mounts do not process the unlinked list at all.
Prior to the introduction of the incore unlinked lists, this wasn't a
problem because the unlink code would (very expensively) traverse the
entire ondisk metadata iunlink chain to keep things up to date.
However, the incore unlinked list code complains when it realizes that
it is out of sync with the ondisk metadata and shuts down the fs, which
is bad.
Ritesh proposed to solve this problem by unconditionally parsing the
unlinked lists at mount time, but this imposes a mount time cost for
every filesystem to catch something that should be very infrequent.
Instead, let's target the places where we can encounter a next_unlinked
pointer that refers to an inode that is not in cache, and load it into
cache.
Note: This patch does not address the problem of iget loading an inode
from the middle of the iunlink list and needing to set i_prev_unlinked
correctly.
Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
and actually return ENOLINK
---
fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
fs/xfs/xfs_trace.h | 25 +++++++++++++++++
2 files changed, 96 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6ee266be45d4..2942002560b5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* Caller can handle inode not being in memory. */
+ rcu_read_unlock();
+ return NULL;
+ }
/*
- * Inode not in memory or in RCU freeing limbo should not happen.
- * Warn about this and let the caller handle the failure.
+ * Inode in RCU freeing limbo should not happen. Warn about this and
+ * let the caller handle the failure.
*/
- if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ if (WARN_ON_ONCE(!ip->i_ino)) {
rcu_read_unlock();
return NULL;
}
@@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
ip = xfs_iunlink_lookup(pag, next_agino);
if (!ip)
- return -EFSCORRUPTED;
+ return -ENOLINK;
+
ip->i_prev_unlinked = prev_agino;
return 0;
}
@@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
return 0;
}
+/*
+ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
+ * to @prev_agino. Caller must hold the AGI to synchronize with other changes
+ * to the unlinked list.
+ */
+STATIC int
+xfs_iunlink_reload_next(
+ struct xfs_trans *tp,
+ struct xfs_buf *agibp,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_perag *pag = agibp->b_pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *next_ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(next_agino != NULLAGINO);
+
+#ifdef DEBUG
+ rcu_read_lock();
+ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
+ ASSERT(next_ip == NULL);
+ rcu_read_unlock();
+#endif
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
+ next_agino, pag->pag_agno);
+
+ /*
+ * Use an untrusted lookup to be cautious in case the AGI has been
+ * corrupted and now points at a free inode. That shouldn't happen,
+ * but we'd rather shut down now since we're already running in a weird
+ * situation.
+ */
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
+ if (error)
+ return error;
+
+ /* If this is not an unlinked inode, something is very wrong. */
+ if (VFS_I(next_ip)->i_nlink != 0) {
+ error = -EFSCORRUPTED;
+ goto rele;
+ }
+
+ next_ip->i_prev_unlinked = prev_agino;
+ trace_xfs_iunlink_reload_next(next_ip);
+rele:
+ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ xfs_irele(next_ip);
+ return error;
+}
+
static int
xfs_iunlink_insert_inode(
struct xfs_trans *tp,
@@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
* inode.
*/
error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
if (error)
return error;
@@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
*/
error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
ip->i_next_unlinked);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
if (error)
return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 36bd42ed9ec8..f4e46bac9b91 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->new_ptr)
);
+TRACE_EVENT(xfs_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
@ 2023-08-30 22:29 ` Dave Chinner
2023-08-30 23:25 ` [PATCH 1/2] xfs_db: dump unlinked buckets Darrick J. Wong
` (6 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Dave Chinner @ 2023-08-30 22:29 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Eric Sandeen, xfs, shrikanth hegde, Ritesh Harjani
On Wed, Aug 30, 2023 at 08:26:59AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> shrikanth hegde reports that filesystems fail shortly after mount with
> the following failure:
>
> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>
> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>
> From diagnostic data collected by the bug reporters, it would appear
> that we cleanly mounted a filesystem that contained unlinked inodes.
> Unlinked inodes are only processed as a final step of log recovery,
> which means that clean mounts do not process the unlinked list at all.
>
> Prior to the introduction of the incore unlinked lists, this wasn't a
> problem because the unlink code would (very expensively) traverse the
> entire ondisk metadata iunlink chain to keep things up to date.
> However, the incore unlinked list code complains when it realizes that
> it is out of sync with the ondisk metadata and shuts down the fs, which
> is bad.
>
> Ritesh proposed to solve this problem by unconditionally parsing the
> unlinked lists at mount time, but this imposes a mount time cost for
> every filesystem to catch something that should be very infrequent.
> Instead, let's target the places where we can encounter a next_unlinked
> pointer that refers to an inode that is not in cache, and load it into
> cache.
>
> Note: This patch does not address the problem of iget loading an inode
> from the middle of the iunlink list and needing to set i_prev_unlinked
> correctly.
>
> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> and actually return ENOLINK
> ---
> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> 2 files changed, 96 insertions(+), 4 deletions(-)
This version looks good.
Reviewed-by: Dave Chinner <dchinner@redhat.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 1/2] xfs_db: dump unlinked buckets
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
2023-08-30 22:29 ` Dave Chinner
@ 2023-08-30 23:25 ` Darrick J. Wong
2023-08-31 20:01 ` Bill O'Donnell
2023-08-30 23:25 ` [PATCH 2/2] xfs_db: create unlinked inodes Darrick J. Wong
` (5 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2023-08-30 23:25 UTC (permalink / raw)
To: Dave Chinner, Eric Sandeen; +Cc: xfs, shrikanth hegde, Ritesh Harjani
From: Darrick J. Wong <djwong@kernel.org>
Create a new command to dump the resource usage of files in the unlinked
buckets.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
db/Makefile | 2
db/command.c | 1
db/command.h | 1
db/iunlink.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++
libxfs/libxfs_api_defs.h | 1
man/man8/xfs_db.8 | 19 ++++
6 files changed, 227 insertions(+), 1 deletion(-)
create mode 100644 db/iunlink.c
diff --git a/db/Makefile b/db/Makefile
index 2f95f670..d00801ab 100644
--- a/db/Makefile
+++ b/db/Makefile
@@ -14,7 +14,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
fuzz.h obfuscate.h
-CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
+CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c iunlink.c namei.c \
timelimit.c
LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/command.c b/db/command.c
index 02f778b9..b4021c86 100644
--- a/db/command.c
+++ b/db/command.c
@@ -127,6 +127,7 @@ init_commands(void)
info_init();
inode_init();
input_init();
+ iunlink_init();
logres_init();
logformat_init();
io_init();
diff --git a/db/command.h b/db/command.h
index 498983ff..a89e7150 100644
--- a/db/command.h
+++ b/db/command.h
@@ -34,3 +34,4 @@ extern void info_init(void);
extern void btheight_init(void);
extern void timelimit_init(void);
extern void namei_init(void);
+extern void iunlink_init(void);
diff --git a/db/iunlink.c b/db/iunlink.c
new file mode 100644
index 00000000..303b5daf
--- /dev/null
+++ b/db/iunlink.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs.h"
+#include "command.h"
+#include "output.h"
+#include "init.h"
+
+static xfs_filblks_t
+count_rtblocks(
+ struct xfs_inode *ip)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ xfs_filblks_t count = 0;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ int error;
+
+ error = -libxfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error) {
+ dbprintf(
+_("could not read AG %u agino %u extents, err=%d\n"),
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ error);
+ return 0;
+ }
+
+ for_each_xfs_iext(ifp, &icur, &got)
+ if (!isnullstartblock(got.br_startblock))
+ count += got.br_blockcount;
+ return count;
+}
+
+static xfs_agino_t
+get_next_unlinked(
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ bool verbose)
+{
+ struct xfs_buf *ino_bp;
+ struct xfs_dinode *dip;
+ struct xfs_inode *ip;
+ xfs_ino_t ino;
+ xfs_agino_t ret;
+ int error;
+
+ ino = XFS_AGINO_TO_INO(mp, agno, agino);
+ error = -libxfs_iget(mp, NULL, ino, 0, &ip);
+ if (error)
+ goto bad;
+
+ if (verbose) {
+ xfs_filblks_t blocks, rtblks = 0;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ rtblks = count_rtblocks(ip);
+ blocks = ip->i_nblocks - rtblks;
+
+ dbprintf(_(" blocks %llu rtblocks %llu\n"),
+ blocks, rtblks);
+ } else {
+ dbprintf("\n");
+ }
+
+ error = -libxfs_imap_to_bp(mp, NULL, &ip->i_imap, &ino_bp);
+ if (error)
+ goto bad;
+
+ dip = xfs_buf_offset(ino_bp, ip->i_imap.im_boffset);
+ ret = be32_to_cpu(dip->di_next_unlinked);
+ libxfs_buf_relse(ino_bp);
+
+ return ret;
+bad:
+ dbprintf(_("AG %u agino %u: %s\n"), agno, agino, strerror(error));
+ return NULLAGINO;
+}
+
+static void
+dump_unlinked_bucket(
+ xfs_agnumber_t agno,
+ struct xfs_buf *agi_bp,
+ unsigned int bucket,
+ bool quiet,
+ bool verbose)
+{
+ struct xfs_agi *agi = agi_bp->b_addr;
+ xfs_agino_t agino;
+ unsigned int i = 0;
+
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ if (agino != NULLAGINO)
+ dbprintf(_("AG %u bucket %u agino %u"), agno, bucket, agino);
+ else if (!quiet && agino == NULLAGINO)
+ dbprintf(_("AG %u bucket %u agino NULL\n"), agno, bucket);
+
+ while (agino != NULLAGINO) {
+ agino = get_next_unlinked(agno, agino, verbose);
+ if (agino != NULLAGINO)
+ dbprintf(_(" [%u] agino %u"), i++, agino);
+ else if (!quiet && agino == NULLAGINO)
+ dbprintf(_(" [%u] agino NULL\n"), i++);
+ }
+}
+
+static void
+dump_unlinked(
+ struct xfs_perag *pag,
+ unsigned int bucket,
+ bool quiet,
+ bool verbose)
+{
+ struct xfs_buf *agi_bp;
+ xfs_agnumber_t agno = pag->pag_agno;
+ int error;
+
+ error = -libxfs_ialloc_read_agi(pag, NULL, &agi_bp);
+ if (error) {
+ dbprintf(_("AGI %u: %s\n"), agno, strerror(errno));
+ return;
+ }
+
+ if (bucket != -1U) {
+ dump_unlinked_bucket(agno, agi_bp, bucket, quiet, verbose);
+ goto relse;
+ }
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ dump_unlinked_bucket(agno, agi_bp, bucket, quiet, verbose);
+ }
+
+relse:
+ libxfs_buf_relse(agi_bp);
+}
+
+static int
+dump_iunlinked_f(
+ int argc,
+ char **argv)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+ unsigned int bucket = -1U;
+ bool quiet = false;
+ bool verbose = false;
+ int c;
+
+ while ((c = getopt(argc, argv, "a:b:qv")) != EOF) {
+ switch (c) {
+ case 'a':
+ agno = atoi(optarg);
+ if (agno >= mp->m_sb.sb_agcount) {
+ dbprintf(_("Unknown AG %u, agcount is %u.\n"),
+ agno, mp->m_sb.sb_agcount);
+ return 0;
+ }
+ break;
+ case 'b':
+ bucket = atoi(optarg);
+ if (bucket >= XFS_AGI_UNLINKED_BUCKETS) {
+ dbprintf(_("Unknown bucket %u, max is 63.\n"),
+ bucket);
+ return 0;
+ }
+ break;
+ case 'q':
+ quiet = true;
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ dbprintf(_("Bad option for dump_iunlinked command.\n"));
+ return 0;
+ }
+ }
+
+ if (agno != NULLAGNUMBER) {
+ struct xfs_perag *pag = libxfs_perag_get(mp, agno);
+
+ dump_unlinked(pag, bucket, quiet, verbose);
+ libxfs_perag_put(pag);
+ return 0;
+ }
+
+ for_each_perag(mp, agno, pag)
+ dump_unlinked(pag, bucket, quiet, verbose);
+
+ return 0;
+}
+
+static const cmdinfo_t dump_iunlinked_cmd =
+ { "dump_iunlinked", NULL, dump_iunlinked_f, 0, -1, 0,
+ N_("[-a agno] [-b bucket] [-q] [-v]"),
+ N_("dump chain of unlinked inode buckets"), NULL };
+
+void
+iunlink_init(void)
+{
+ add_command(&dump_iunlinked_cmd);
+}
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 026aa510..ddba5c7c 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -125,6 +125,7 @@
#define xfs_idestroy_fork libxfs_idestroy_fork
#define xfs_iext_lookup_extent libxfs_iext_lookup_extent
#define xfs_ifork_zap_attr libxfs_ifork_zap_attr
+#define xfs_imap_to_bp libxfs_imap_to_bp
#define xfs_initialize_perag libxfs_initialize_perag
#define xfs_initialize_perag_data libxfs_initialize_perag_data
#define xfs_init_local_fork libxfs_init_local_fork
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 60dcdc52..2d6d0da4 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -579,6 +579,25 @@ print the current debug option bits. These are for the use of the implementor.
.BI "dquot [" \-g | \-p | \-u ] " id"
Set current address to a group, project or user quota block for the given ID. Defaults to user quota.
.TP
+.BI "dump_iunlinked [-a " agno " ] [-b " bucket " ] [-q] [-v]"
+Dump the contents of unlinked buckets.
+
+Options include:
+.RS 1.0i
+.TP 0.4i
+.B \-a
+Print only this AG's unlinked buckets.
+.TP 0.4i
+.B \-b
+Print only this bucket within each AGI.
+.TP 0.4i
+.B \-q
+Only print the essentials.
+.TP 0.4i
+.B \-v
+Print resource usage of each file on the unlinked lists.
+.RE
+.TP
.BI "echo [" arg "] ..."
Echo the arguments to the output.
.TP
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 2/2] xfs_db: create unlinked inodes
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
2023-08-30 22:29 ` Dave Chinner
2023-08-30 23:25 ` [PATCH 1/2] xfs_db: dump unlinked buckets Darrick J. Wong
@ 2023-08-30 23:25 ` Darrick J. Wong
2023-08-31 20:02 ` Bill O'Donnell
2023-08-30 23:26 ` [RFC PATCH] fstests: test unlinked inode list repair on demand Darrick J. Wong
` (4 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2023-08-30 23:25 UTC (permalink / raw)
To: Dave Chinner, Eric Sandeen; +Cc: xfs, shrikanth hegde, Ritesh Harjani
From: Darrick J. Wong <djwong@kernel.org>
Create an expert-mode debugger command to create unlinked inodes.
This will hopefully aid in simulation of leaked unlinked inode handling
in the kernel and elsewhere.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
db/iunlink.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++
libxfs/libxfs_api_defs.h | 1
man/man8/xfs_db.8 | 11 +++
3 files changed, 208 insertions(+)
diff --git a/db/iunlink.c b/db/iunlink.c
index 303b5daf..d87562e3 100644
--- a/db/iunlink.c
+++ b/db/iunlink.c
@@ -197,8 +197,204 @@ static const cmdinfo_t dump_iunlinked_cmd =
N_("[-a agno] [-b bucket] [-q] [-v]"),
N_("dump chain of unlinked inode buckets"), NULL };
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = -libxfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ libxfs_dinode_calc_crc(mp, dip);
+ libxfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+static int
+iunlink_insert_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int offset;
+ int error;
+
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino))
+ return EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = iunlink_log_dinode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ }
+
+ /* Update the bucket. */
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ libxfs_trans_log_buf(tp, agibp, offset,
+ offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+ */
+static int
+iunlink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agibp;
+ int error;
+
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+
+ pag = libxfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = -libxfs_read_agi(pag, tp, &agibp);
+ if (error)
+ goto out;
+
+ error = iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+ libxfs_perag_put(pag);
+ return error;
+}
+
+static int
+create_unlinked(
+ struct xfs_mount *mp)
+{
+ struct cred cr = { };
+ struct fsxattr fsx = { };
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_create_tmpfile, resblks,
+ 0, 0, &tp);
+ if (error) {
+ dbprintf(_("alloc trans: %s\n"), strerror(error));
+ return error;
+ }
+
+ error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG | 0600, 0, 0, &cr, &fsx,
+ &ip);
+ if (error) {
+ dbprintf(_("create inode: %s\n"), strerror(error));
+ goto out_cancel;
+ }
+
+ error = iunlink(tp, ip);
+ if (error) {
+ dbprintf(_("unlink inode: %s\n"), strerror(error));
+ goto out_rele;
+ }
+
+ error = -libxfs_trans_commit(tp);
+ if (error)
+ dbprintf(_("commit inode: %s\n"), strerror(error));
+
+ dbprintf(_("Created unlinked inode %llu in agno %u\n"),
+ (unsigned long long)ip->i_ino,
+ XFS_INO_TO_AGNO(mp, ip->i_ino));
+ libxfs_irele(ip);
+ return error;
+out_rele:
+ libxfs_irele(ip);
+out_cancel:
+ libxfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+iunlink_f(
+ int argc,
+ char **argv)
+{
+ int nr = 1;
+ int c;
+ int error;
+
+ while ((c = getopt(argc, argv, "n:")) != EOF) {
+ switch (c) {
+ case 'n':
+ nr = atoi(optarg);
+ if (nr <= 0) {
+ dbprintf(_("%s: need positive number\n"));
+ return 0;
+ }
+ break;
+ default:
+ dbprintf(_("Bad option for iunlink command.\n"));
+ return 0;
+ }
+ }
+
+ for (c = 0; c < nr; c++) {
+ error = create_unlinked(mp);
+ if (error)
+ return 1;
+ }
+
+ return 0;
+}
+
+static const cmdinfo_t iunlink_cmd =
+ { "iunlink", NULL, iunlink_f, 0, -1, 0,
+ N_("[-n nr]"),
+ N_("allocate inodes and put them on the unlinked list"), NULL };
+
void
iunlink_init(void)
{
add_command(&dump_iunlinked_cmd);
+ if (expert_mode)
+ add_command(&iunlink_cmd);
}
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index ddba5c7c..04277c00 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -149,6 +149,7 @@
#define xfs_prealloc_blocks libxfs_prealloc_blocks
#define xfs_read_agf libxfs_read_agf
+#define xfs_read_agi libxfs_read_agi
#define xfs_refc_block libxfs_refc_block
#define xfs_refcountbt_calc_reserves libxfs_refcountbt_calc_reserves
#define xfs_refcountbt_calc_size libxfs_refcountbt_calc_size
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 2d6d0da4..f53ddd67 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -840,6 +840,17 @@ Set the current inode number. If no
.I inode#
is given, print the current inode number.
.TP
+.BI "iunlink [-n " nr " ]"
+Allocate inodes and put them on the unlinked list.
+
+Options include:
+.RS 1.0i
+.TP 0.4i
+.B \-n
+Create this number of unlinked inodes.
+If not specified, 1 inode will be created.
+.RE
+.TP
.BI "label [" label ]
Set the filesystem label. The filesystem label can be used by
.BR mount (8)
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [RFC PATCH] fstests: test unlinked inode list repair on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
` (2 preceding siblings ...)
2023-08-30 23:25 ` [PATCH 2/2] xfs_db: create unlinked inodes Darrick J. Wong
@ 2023-08-30 23:26 ` Darrick J. Wong
2023-08-31 12:39 ` [PATCH v2] xfs: load uncached unlinked inodes into memory " Ritesh Harjani
` (3 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2023-08-30 23:26 UTC (permalink / raw)
To: Dave Chinner, Eric Sandeen; +Cc: xfs, shrikanth hegde, Ritesh Harjani
From: Darrick J. Wong <djwong@kernel.org>
Create a test to exercise recovery of unlinked inodes on a clean
filesystem. This was definitely possible on old kernels that on an ro
mount would clean the log without processing the iunlink list.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
common/rc | 4 +
tests/xfs/1872 | 89 +++++++++++++++++++++
tests/xfs/1872.out | 3 +
tests/xfs/1873 | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/xfs/1873.out | 6 +
5 files changed, 317 insertions(+), 1 deletion(-)
create mode 100755 tests/xfs/1872
create mode 100644 tests/xfs/1872.out
create mode 100755 tests/xfs/1873
create mode 100644 tests/xfs/1873.out
diff --git a/common/rc b/common/rc
index 45d488b2aa..982dd8c312 100644
--- a/common/rc
+++ b/common/rc
@@ -2674,9 +2674,11 @@ _require_xfs_io_command()
param_checked="$pwrite_opts $param"
;;
"scrub"|"repair")
- testio=`$XFS_IO_PROG -x -c "$command probe" $TEST_DIR 2>&1`
+ test -z "$param" && param="probe"
+ testio=`$XFS_IO_PROG -x -c "$command $param" $TEST_DIR 2>&1`
echo $testio | grep -q "Inappropriate ioctl" && \
_notrun "xfs_io $command support is missing"
+ param_checked="$param"
;;
"startupdate"|"commitupdate"|"cancelupdate")
$XFS_IO_PROG -f -c 'pwrite -S 0x58 0 128k -b 128k' $testfile > /dev/null
diff --git a/tests/xfs/1872 b/tests/xfs/1872
new file mode 100755
index 0000000000..0fd5fbcb70
--- /dev/null
+++ b/tests/xfs/1872
@@ -0,0 +1,89 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1872
+#
+# Test using runtime code to fix unlinked inodes on a clean filesystem that
+# never got cleaned up.
+#
+. ./common/preamble
+_begin_fstest auto quick unlink
+
+# Import common functions.
+source ./common/filter
+source ./common/fuzzy
+source ./common/quota
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_require_xfs_db_command iunlink
+_require_scratch_nocheck # we'll run repair ourselves
+
+# From the AGI definition
+XFS_AGI_UNLINKED_BUCKETS=64
+
+# Try to make each iunlink bucket have this many inodes in it.
+IUNLINK_BUCKETLEN=5
+
+# XXX Forcibly disable quota since quotacheck will break this test
+_qmount_option 'noquota'
+
+format_scratch() {
+ _scratch_mkfs -d agcount=1 | _filter_mkfs 2> "${tmp}.mkfs" >> $seqres.full
+ source "${tmp}.mkfs"
+ test "${agcount}" -eq 1 || _notrun "test requires 1 AG for error injection"
+
+ local nr_iunlinks="$((IUNLINK_BUCKETLEN * XFS_AGI_UNLINKED_BUCKETS))"
+ readarray -t BADINODES < <(_scratch_xfs_db -x -c "iunlink -n $nr_iunlinks" | awk '{print $4}')
+}
+
+__repair_check_scratch() {
+ _scratch_xfs_repair -o force_geometry -n 2>&1 | \
+ grep -E '(disconnected inode.*would move|next_unlinked in inode|unlinked bucket.*is.*in ag)'
+ return "${PIPESTATUS[0]}"
+}
+
+exercise_scratch() {
+ # Create a bunch of files...
+ declare -A inums
+ for ((i = 0; i < (XFS_AGI_UNLINKED_BUCKETS * 2); i++)); do
+ touch "${SCRATCH_MNT}/${i}" || break
+ inums["${i}"]="$(stat -c %i "${SCRATCH_MNT}/${i}")"
+ done
+
+ # ...then delete them to exercise the unlinked buckets
+ for ((i = 0; i < (XFS_AGI_UNLINKED_BUCKETS * 2); i++)); do
+ if ! rm -f "${SCRATCH_MNT}/${i}"; then
+ echo "rm failed on inum ${inums[$i]}"
+ break
+ fi
+ done
+}
+
+# Offline repair should not find anything
+final_check_scratch() {
+ __repair_check_scratch
+ res=$?
+ if [ $res -eq 2 ]; then
+ echo "scratch fs went offline?"
+ _scratch_mount
+ _scratch_unmount
+ __repair_check_scratch
+ fi
+ test "$res" -ne 0 && echo "repair returned $res?"
+}
+
+echo "+ Part 0: See if runtime can recover the unlinked list" | tee -a $seqres.full
+format_scratch
+_scratch_mount
+exercise_scratch
+_scratch_unmount
+final_check_scratch
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1872.out b/tests/xfs/1872.out
new file mode 100644
index 0000000000..0aa4b5ae5c
--- /dev/null
+++ b/tests/xfs/1872.out
@@ -0,0 +1,3 @@
+QA output created by 1872
++ Part 0: See if runtime can recover the unlinked list
+Silence is golden
diff --git a/tests/xfs/1873 b/tests/xfs/1873
new file mode 100755
index 0000000000..1e2f13f2c6
--- /dev/null
+++ b/tests/xfs/1873
@@ -0,0 +1,216 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Oracle. All Rights Reserved.
+#
+# FS QA Test No. 1873
+#
+# Functional test of using online repair to fix unlinked inodes on a clean
+# filesystem that never got cleaned up.
+#
+. ./common/preamble
+_begin_fstest auto online_repair
+
+# Import common functions.
+source ./common/filter
+source ./common/fuzzy
+source ./common/quota
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs generic
+_require_xfs_db_command iunlink
+# The iunlink bucket repair code wasn't added to the AGI repair code
+# until after the directory repair code was merged
+_require_xfs_io_command repair -R directory
+_require_scratch_nocheck # repair doesn't like single-AG fs
+
+# From the AGI definition
+XFS_AGI_UNLINKED_BUCKETS=64
+
+# Try to make each iunlink bucket have this many inodes in it.
+IUNLINK_BUCKETLEN=5
+
+# XXX Forcibly disable quota since quotacheck will break this test
+_qmount_option 'noquota'
+
+format_scratch() {
+ _scratch_mkfs -d agcount=1 | _filter_mkfs 2> "${tmp}.mkfs" >> $seqres.full
+ source "${tmp}.mkfs"
+ test "${agcount}" -eq 1 || _notrun "test requires 1 AG for error injection"
+
+ local nr_iunlinks="$((IUNLINK_BUCKETLEN * XFS_AGI_UNLINKED_BUCKETS))"
+ readarray -t BADINODES < <(_scratch_xfs_db -x -c "iunlink -n $nr_iunlinks" | awk '{print $4}')
+}
+
+__repair_check_scratch() {
+ _scratch_xfs_repair -o force_geometry -n 2>&1 | \
+ grep -E '(disconnected inode.*would move|next_unlinked in inode|unlinked bucket.*is.*in ag)'
+ return "${PIPESTATUS[0]}"
+}
+
+corrupt_scratch() {
+ # How far into the iunlink bucket chain do we target inodes for corruption?
+ # 1 = target the inode pointed to by the AGI
+ # 3 = middle of bucket list
+ # 5 = last element in bucket
+ local corruption_bucket_depth="$1"
+ if ((corruption_bucket_depth < 1 || corruption_bucket_depth > IUNLINK_BUCKETLEN)); then
+ echo "${corruption_bucket_depth}: Value must be between 1 and ${IUNLINK_BUCKETLEN}."
+ return 1
+ fi
+
+ # Index of the inode numbers within BADINODES
+ local bad_ino1_idx=$(( (IUNLINK_BUCKETLEN - corruption_bucket_depth) * XFS_AGI_UNLINKED_BUCKETS))
+ local bad_ino2_idx=$((bad_ino1_idx + 1))
+
+ # Inode numbers to target
+ local bad_ino1="${BADINODES[bad_ino1_idx]}"
+ local bad_ino2="${BADINODES[bad_ino2_idx]}"
+ printf "bad: 0x%x 0x%x\n" "${bad_ino1}" "${bad_ino2}" | _tee_kernlog >> $seqres.full
+
+ # Bucket within AGI 0's iunlinked array.
+ local ino1_bucket="$((bad_ino1 % XFS_AGI_UNLINKED_BUCKETS))"
+ local ino2_bucket="$((bad_ino2 % XFS_AGI_UNLINKED_BUCKETS))"
+
+ # The first bad inode stays on the unlinked list but gets a nonzero
+ # nlink; the second bad inode is removed from the unlinked list but
+ # keeps its zero nlink
+ _scratch_xfs_db -x \
+ -c "inode ${bad_ino1}" -c "write -d core.nlinkv2 5555" \
+ -c "agi 0" -c "fuzz -d unlinked[${ino2_bucket}] ones" -c "print unlinked" >> $seqres.full
+
+ local iwatch=()
+ local idx
+
+ # Make a list of the adjacent iunlink bucket inodes for the first inode
+ # that we targeted.
+ if [ "${corruption_bucket_depth}" -gt 1 ]; then
+ # Previous ino in bucket
+ idx=$(( (IUNLINK_BUCKETLEN - corruption_bucket_depth + 1) * XFS_AGI_UNLINKED_BUCKETS))
+ iwatch+=("${BADINODES[idx]}")
+ fi
+ iwatch+=("${bad_ino1}")
+ if [ "$((corruption_bucket_depth + 1))" -lt "${IUNLINK_BUCKETLEN}" ]; then
+ # Next ino in bucket
+ idx=$(( (IUNLINK_BUCKETLEN - corruption_bucket_depth - 1) * XFS_AGI_UNLINKED_BUCKETS))
+ iwatch+=("${BADINODES[idx]}")
+ fi
+
+ # Make a list of the adjacent iunlink bucket inodes for the second
+ # inode that we targeted.
+ if [ "${corruption_bucket_depth}" -gt 1 ]; then
+ # Previous ino in bucket
+ idx=$(( (IUNLINK_BUCKETLEN - corruption_bucket_depth + 1) * XFS_AGI_UNLINKED_BUCKETS))
+ iwatch+=("${BADINODES[idx + 1]}")
+ fi
+ iwatch+=("${bad_ino2}")
+ if [ "$((corruption_bucket_depth + 1))" -lt "${IUNLINK_BUCKETLEN}" ]; then
+ # Next ino in bucket
+ idx=$(( (IUNLINK_BUCKETLEN - corruption_bucket_depth - 1) * XFS_AGI_UNLINKED_BUCKETS))
+ iwatch+=("${BADINODES[idx + 1]}")
+ fi
+
+ # Construct a grep string for tracepoints.
+ GREP_STR="(xrep_attempt|xrep_done|bucket ${ino1_bucket} |bucket ${ino2_bucket} |bucket ${fuzz_bucket} "
+ GREP_STR="(xrep_attempt|xrep_done|bucket ${ino1_bucket} |bucket ${ino2_bucket} "
+ for ino in "${iwatch[@]}"; do
+ f="$(printf "|ino 0x%x" "${ino}")"
+ GREP_STR="${GREP_STR}${f}"
+ done
+ GREP_STR="${GREP_STR})"
+ echo "grep -E \"${GREP_STR}\"" >> $seqres.full
+
+ # Dump everything we did to to the full file.
+ local db_dump=(-c 'agi 0' -c 'print unlinked')
+ db_dump+=(-c 'addr root' -c 'print')
+ test "${ino1_bucket}" -gt 0 && \
+ db_dump+=(-c "dump_iunlinked -a 0 -b $((ino1_bucket - 1))")
+ db_dump+=(-c "dump_iunlinked -a 0 -b ${ino1_bucket}")
+ db_dump+=(-c "dump_iunlinked -a 0 -b ${ino2_bucket}")
+ test "${ino2_bucket}" -lt 63 && \
+ db_dump+=(-c "dump_iunlinked -a 0 -b $((ino2_bucket + 1))")
+ db_dump+=(-c "inode $bad_ino1" -c 'print core.nlinkv2 v3.inumber next_unlinked')
+ db_dump+=(-c "inode $bad_ino2" -c 'print core.nlinkv2 v3.inumber next_unlinked')
+ _scratch_xfs_db "${db_dump[@]}" >> $seqres.full
+
+ # Test run of repair to make sure we find disconnected inodes
+ __repair_check_scratch | \
+ sed -e 's/disconnected inode \([0-9]*\)/disconnected inode XXXXXX/g' \
+ -e 's/next_unlinked in inode \([0-9]*\)/next_unlinked in inode XXXXXX/g' \
+ -e 's/unlinked bucket \([0-9]*\) is \([0-9]*\) in ag \([0-9]*\) .inode=\([0-9]*\)/unlinked bucket YY is XXXXXX in ag Z (inode=AAAAAA/g' | \
+ uniq -c >> $seqres.full
+ res=${PIPESTATUS[0]}
+ test "$res" -ne 0 || echo "repair returned $res after corruption?"
+}
+
+exercise_scratch() {
+ # Create a bunch of files...
+ declare -A inums
+ for ((i = 0; i < (XFS_AGI_UNLINKED_BUCKETS * 2); i++)); do
+ touch "${SCRATCH_MNT}/${i}" || break
+ inums["${i}"]="$(stat -c %i "${SCRATCH_MNT}/${i}")"
+ done
+
+ # ...then delete them to exercise the unlinked buckets
+ for ((i = 0; i < (XFS_AGI_UNLINKED_BUCKETS * 2); i++)); do
+ if ! rm -f "${SCRATCH_MNT}/${i}"; then
+ echo "rm failed on inum ${inums[$i]}"
+ break
+ fi
+ done
+}
+
+# Offline repair should not find anything
+final_check_scratch() {
+ __repair_check_scratch
+ res=$?
+ if [ $res -eq 2 ]; then
+ echo "scratch fs went offline?"
+ _scratch_mount
+ _scratch_unmount
+ __repair_check_scratch
+ fi
+ test "$res" -ne 0 && echo "repair returned $res?"
+}
+
+echo "+ Part 1: See if scrub can recover the unlinked list" | tee -a $seqres.full
+format_scratch
+_kernlog "no bad inodes"
+_scratch_mount
+_scratch_scrub >> $seqres.full
+exercise_scratch
+_scratch_unmount
+final_check_scratch
+
+echo "+ Part 2: Corrupt the first inode in the bucket" | tee -a $seqres.full
+format_scratch
+corrupt_scratch 1
+_scratch_mount
+_scratch_scrub >> $seqres.full
+exercise_scratch
+_scratch_unmount
+final_check_scratch
+
+echo "+ Part 3: Corrupt the middle inode in the bucket" | tee -a $seqres.full
+format_scratch
+corrupt_scratch 3
+_scratch_mount
+_scratch_scrub >> $seqres.full
+exercise_scratch
+_scratch_unmount
+final_check_scratch
+
+echo "+ Part 4: Corrupt the last inode in the bucket" | tee -a $seqres.full
+format_scratch
+corrupt_scratch 5
+_scratch_mount
+_scratch_scrub >> $seqres.full
+exercise_scratch
+_scratch_unmount
+final_check_scratch
+
+# success, all done
+echo Silence is golden
+status=0
+exit
diff --git a/tests/xfs/1873.out b/tests/xfs/1873.out
new file mode 100644
index 0000000000..0e36bd2304
--- /dev/null
+++ b/tests/xfs/1873.out
@@ -0,0 +1,6 @@
+QA output created by 1873
++ Part 1: See if scrub can recover the unlinked list
++ Part 2: Corrupt the first inode in the bucket
++ Part 3: Corrupt the middle inode in the bucket
++ Part 4: Corrupt the last inode in the bucket
+Silence is golden
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
` (3 preceding siblings ...)
2023-08-30 23:26 ` [RFC PATCH] fstests: test unlinked inode list repair on demand Darrick J. Wong
@ 2023-08-31 12:39 ` Ritesh Harjani
2023-08-31 20:39 ` Eric Sandeen
2023-08-31 17:43 ` Eric Sandeen
` (2 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Ritesh Harjani @ 2023-08-31 12:39 UTC (permalink / raw)
To: Darrick J. Wong, Dave Chinner, Eric Sandeen; +Cc: xfs, shrikanth hegde
"Darrick J. Wong" <djwong@kernel.org> writes:
> From: Darrick J. Wong <djwong@kernel.org>
>
> shrikanth hegde reports that filesystems fail shortly after mount with
> the following failure:
>
> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>
> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>
> From diagnostic data collected by the bug reporters, it would appear
> that we cleanly mounted a filesystem that contained unlinked inodes.
> Unlinked inodes are only processed as a final step of log recovery,
> which means that clean mounts do not process the unlinked list at all.
>
> Prior to the introduction of the incore unlinked lists, this wasn't a
> problem because the unlink code would (very expensively) traverse the
> entire ondisk metadata iunlink chain to keep things up to date.
> However, the incore unlinked list code complains when it realizes that
> it is out of sync with the ondisk metadata and shuts down the fs, which
> is bad.
>
> Ritesh proposed to solve this problem by unconditionally parsing the
> unlinked lists at mount time, but this imposes a mount time cost for
> every filesystem to catch something that should be very infrequent.
> Instead, let's target the places where we can encounter a next_unlinked
> pointer that refers to an inode that is not in cache, and load it into
> cache.
>
> Note: This patch does not address the problem of iget loading an inode
> from the middle of the iunlink list and needing to set i_prev_unlinked
> correctly.
>
> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> and actually return ENOLINK
> ---
> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> 2 files changed, 96 insertions(+), 4 deletions(-)
Hi Darrick,
Thanks for taking a look at this. I tested this patch on the setup where
Shrikanth earlier saw the crash. I still can see a problem. I saw it is
taking the branch from
+ /* If this is not an unlinked inode, something is very wrong. */
+ if (VFS_I(next_ip)->i_nlink != 0) {
+ error = -EFSCORRUPTED;
+ goto rele;
+ }
Here are the logs of reference -
[ 21.399573] XFS (dm-0): Found unrecovered unlinked inode 0x2ec44d in AG 0x0. Initiating recovery.
[ 21.400150] XFS (dm-0): Internal error xfs_trans_cancel at line 1104 of file fs/xfs/xfs_trans.c. Caller xfs_remove+0x1a0/0x310 [xfs]
[ 21.400222] CPU: 0 PID: 1629 Comm: systemd-tmpfile Not tainted 6.5.0+ #2
[ 21.400226] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1010.22 (NH1010_122) hv:phyp pSeries
[ 21.400230] Call Trace:
[ 21.400231] [c000000014cdbb70] [c000000000f377b8] dump_stack_lvl+0x6c/0x9c (unreliable)
[ 21.400239] [c000000014cdbba0] [c008000000f7c204] xfs_error_report+0x5c/0x80 [xfs]
[ 21.400303] [c000000014cdbc00] [c008000000fab320] xfs_trans_cancel+0x178/0x1b0 [xfs]
[ 21.400371] [c000000014cdbc50] [c008000000f999d8] xfs_remove+0x1a0/0x310 [xfs]
[ 21.400432] [c000000014cdbcc0] [c008000000f93eb0] xfs_vn_unlink+0x68/0xf0 [xfs]
[ 21.400493] [c000000014cdbd20] [c0000000005b8038] vfs_rmdir+0x178/0x300
[ 21.400498] [c000000014cdbd60] [c0000000005be444] do_rmdir+0x124/0x240
[ 21.400502] [c000000014cdbdf0] [c0000000005be594] sys_rmdir+0x34/0x50
[ 21.400506] [c000000014cdbe10] [c000000000033c38] system_call_exception+0x148/0x3a0
[ 21.400511] [c000000014cdbe50] [c00000000000c6d4] system_call_common+0xf4/0x258
[ 21.400515] --- interrupt: c00 at 0x7fff9ad230d4
[ 21.400518] NIP: 00007fff9ad230d4 LR: 00007fff9b12ff74 CTR: 0000000000000000
[ 21.400521] REGS: c000000014cdbe80 TRAP: 0c00 Not tainted (6.5.0+)
[ 21.400524] MSR: 800000000280f033 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE> CR: 28004204 XER: 00000000
[ 21.400534] IRQMASK: 0
[ 21.400534] GPR00: 0000000000000028 00007fffc57f9aa0 00007fff9ae07300 000000014e651df0
[ 21.400534] GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 21.400534] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 21.400534] GPR12: 0000000000000000 00007fff9b4f5e60 000000013ebbee38 000000013ebc05d8
[ 21.400534] GPR16: 000000013ebbee60 000000013ebc0818 000000013ebbee70 000000013ebbfe68
[ 21.400534] GPR20: 000000013ebe003b 000000013ebe0051 000000013ebe003c 0000000000000000
[ 21.400534] GPR24: 000000013ebc07d8 000000014e6496c0 000000013ebe0078 00007fffc57f9c80
[ 21.400534] GPR28: 0000000000000000 000000014e651df0 0000000000000000 000000000000000e
[ 21.400569] NIP [00007fff9ad230d4] 0x7fff9ad230d4
[ 21.400571] LR [00007fff9b12ff74] 0x7fff9b12ff74
[ 21.400573] --- interrupt: c00
[ 21.402551] XFS (dm-0): Corruption of in-memory data (0x8) detected at xfs_trans_cancel+0x190/0x1b0 [xfs] (fs/xfs/xfs_trans.c:1105). Shutting down filesystem.
[ 21.402615] XFS (dm-0): Please unmount the filesystem and rectify the problem(s)
Do you know any possible reason/explaination on what could be going wrong?
>
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 6ee266be45d4..2942002560b5 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
>
> rcu_read_lock();
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> + if (!ip) {
> + /* Caller can handle inode not being in memory. */
> + rcu_read_unlock();
> + return NULL;
> + }
>
> /*
> - * Inode not in memory or in RCU freeing limbo should not happen.
> - * Warn about this and let the caller handle the failure.
> + * Inode in RCU freeing limbo should not happen. Warn about this and
> + * let the caller handle the failure.
> */
> - if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
> + if (WARN_ON_ONCE(!ip->i_ino)) {
> rcu_read_unlock();
> return NULL;
> }
> @@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
>
> ip = xfs_iunlink_lookup(pag, next_agino);
> if (!ip)
> - return -EFSCORRUPTED;
> + return -ENOLINK;
> +
> ip->i_prev_unlinked = prev_agino;
> return 0;
> }
> @@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
> return 0;
> }
>
> +/*
> + * Load the inode @next_agino into the cache and set its prev_unlinked pointer
> + * to @prev_agino. Caller must hold the AGI to synchronize with other changes
> + * to the unlinked list.
> + */
> +STATIC int
> +xfs_iunlink_reload_next(
> + struct xfs_trans *tp,
> + struct xfs_buf *agibp,
> + xfs_agino_t prev_agino,
> + xfs_agino_t next_agino)
> +{
> + struct xfs_perag *pag = agibp->b_pag;
> + struct xfs_mount *mp = pag->pag_mount;
> + struct xfs_inode *next_ip = NULL;
> + xfs_ino_t ino;
> + int error;
> +
> + ASSERT(next_agino != NULLAGINO);
> +
> +#ifdef DEBUG
> + rcu_read_lock();
> + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
> + ASSERT(next_ip == NULL);
> + rcu_read_unlock();
> +#endif
> +
> + xfs_info_ratelimited(mp,
> + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
> + next_agino, pag->pag_agno);
> +
> + /*
> + * Use an untrusted lookup to be cautious in case the AGI has been
> + * corrupted and now points at a free inode. That shouldn't happen,
> + * but we'd rather shut down now since we're already running in a weird
> + * situation.
> + */
> + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
> + if (error)
> + return error;
> +
> + /* If this is not an unlinked inode, something is very wrong. */
> + if (VFS_I(next_ip)->i_nlink != 0) {
> + error = -EFSCORRUPTED;
> + goto rele;
> + }
> +
> + next_ip->i_prev_unlinked = prev_agino;
> + trace_xfs_iunlink_reload_next(next_ip);
> +rele:
> + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
> + xfs_irele(next_ip);
> + return error;
> +}
> +
> static int
> xfs_iunlink_insert_inode(
> struct xfs_trans *tp,
> @@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
> * inode.
> */
> error = xfs_iunlink_update_backref(pag, agino, next_agino);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
> if (error)
> return error;
>
> @@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
> */
> error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
> ip->i_next_unlinked);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
> + ip->i_next_unlinked);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 36bd42ed9ec8..f4e46bac9b91 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> __entry->new_ptr)
> );
>
> +TRACE_EVENT(xfs_iunlink_reload_next,
> + TP_PROTO(struct xfs_inode *ip),
> + TP_ARGS(ip),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_agnumber_t, agno)
> + __field(xfs_agino_t, agino)
> + __field(xfs_agino_t, prev_agino)
> + __field(xfs_agino_t, next_agino)
> + ),
> + TP_fast_assign(
> + __entry->dev = ip->i_mount->m_super->s_dev;
> + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> + __entry->prev_agino = ip->i_prev_unlinked;
> + __entry->next_agino = ip->i_next_unlinked;
> + ),
> + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->agno,
> + __entry->agino,
> + __entry->prev_agino,
> + __entry->next_agino)
> +);
> +
> DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> TP_PROTO(struct xfs_inode *ip),
> TP_ARGS(ip),
-ritesh
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
` (4 preceding siblings ...)
2023-08-31 12:39 ` [PATCH v2] xfs: load uncached unlinked inodes into memory " Ritesh Harjani
@ 2023-08-31 17:43 ` Eric Sandeen
2023-08-31 18:10 ` Bill O'Donnell
2023-09-01 14:31 ` Bill O'Donnell
7 siblings, 0 replies; 15+ messages in thread
From: Eric Sandeen @ 2023-08-31 17:43 UTC (permalink / raw)
To: Darrick J. Wong, Dave Chinner, Eric Sandeen
Cc: xfs, shrikanth hegde, Ritesh Harjani
On 8/30/23 10:26 AM, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> shrikanth hegde reports that filesystems fail shortly after mount with
> the following failure:
>
> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>
> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>
> From diagnostic data collected by the bug reporters, it would appear
> that we cleanly mounted a filesystem that contained unlinked inodes.
> Unlinked inodes are only processed as a final step of log recovery,
> which means that clean mounts do not process the unlinked list at all.
Thank you for working on this Darrick!
I'll point out that at least one way to end up in this situation is to
have at one point run a very old kernel which did not contain this
commit, merged in kernel v4.14:
commit 6f4a1eefdd0ad4561543270a7fceadabcca075dd
Author: Eric Sandeen <sandeen@sandeen.net>
Date: Tue Aug 8 18:21:49 2017 -0700
xfs: toggle readonly state around xfs_log_mount_finish
When we do log recovery on a readonly mount, unlinked inode
processing does not happen due to the readonly checks in
xfs_inactive(), which are trying to prevent any I/O on a
readonly mount.
This is misguided - we do I/O on readonly mounts all the time,
for consistency; for example, log recovery. So do the same
RDONLY flag twiddling around xfs_log_mount_finish() as we
do around xfs_log_mount(), for the same reason.
This all cries out for a big rework but for now this is a
simple fix to an obvious problem.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
so if you:
1) Crash with unlinked inodes
2) mount -o ro <recovers log but skips unlinked inode recovery>
3) mount -o remount,rw
4) umount <writes clean log record>
you now have a filesystem with on-disk unlinked inodes and a clean log,
and those inodes won't get cleaned up until log recovery runs again or
xfs_repair is run.
And in testing an old OS (RHEL7) it does seem that the root filesystem
goes through a mount -o ro, mount -o remount,rw transition at boot time.
So this situation may be somewhat common.
-Eric
> Prior to the introduction of the incore unlinked lists, this wasn't a
> problem because the unlink code would (very expensively) traverse the
> entire ondisk metadata iunlink chain to keep things up to date.
> However, the incore unlinked list code complains when it realizes that
> it is out of sync with the ondisk metadata and shuts down the fs, which
> is bad.
>
> Ritesh proposed to solve this problem by unconditionally parsing the
> unlinked lists at mount time, but this imposes a mount time cost for
> every filesystem to catch something that should be very infrequent.
> Instead, let's target the places where we can encounter a next_unlinked
> pointer that refers to an inode that is not in cache, and load it into
> cache.
>
> Note: This patch does not address the problem of iget loading an inode
> from the middle of the iunlink list and needing to set i_prev_unlinked
> correctly.
>
> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> and actually return ENOLINK
> ---
> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> 2 files changed, 96 insertions(+), 4 deletions(-)
>
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 6ee266be45d4..2942002560b5 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
>
> rcu_read_lock();
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> + if (!ip) {
> + /* Caller can handle inode not being in memory. */
> + rcu_read_unlock();
> + return NULL;
> + }
>
> /*
> - * Inode not in memory or in RCU freeing limbo should not happen.
> - * Warn about this and let the caller handle the failure.
> + * Inode in RCU freeing limbo should not happen. Warn about this and
> + * let the caller handle the failure.
> */
> - if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
> + if (WARN_ON_ONCE(!ip->i_ino)) {
> rcu_read_unlock();
> return NULL;
> }
> @@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
>
> ip = xfs_iunlink_lookup(pag, next_agino);
> if (!ip)
> - return -EFSCORRUPTED;
> + return -ENOLINK;
> +
> ip->i_prev_unlinked = prev_agino;
> return 0;
> }
> @@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
> return 0;
> }
>
> +/*
> + * Load the inode @next_agino into the cache and set its prev_unlinked pointer
> + * to @prev_agino. Caller must hold the AGI to synchronize with other changes
> + * to the unlinked list.
> + */
> +STATIC int
> +xfs_iunlink_reload_next(
> + struct xfs_trans *tp,
> + struct xfs_buf *agibp,
> + xfs_agino_t prev_agino,
> + xfs_agino_t next_agino)
> +{
> + struct xfs_perag *pag = agibp->b_pag;
> + struct xfs_mount *mp = pag->pag_mount;
> + struct xfs_inode *next_ip = NULL;
> + xfs_ino_t ino;
> + int error;
> +
> + ASSERT(next_agino != NULLAGINO);
> +
> +#ifdef DEBUG
> + rcu_read_lock();
> + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
> + ASSERT(next_ip == NULL);
> + rcu_read_unlock();
> +#endif
> +
> + xfs_info_ratelimited(mp,
> + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
> + next_agino, pag->pag_agno);
> +
> + /*
> + * Use an untrusted lookup to be cautious in case the AGI has been
> + * corrupted and now points at a free inode. That shouldn't happen,
> + * but we'd rather shut down now since we're already running in a weird
> + * situation.
> + */
> + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
> + if (error)
> + return error;
> +
> + /* If this is not an unlinked inode, something is very wrong. */
> + if (VFS_I(next_ip)->i_nlink != 0) {
> + error = -EFSCORRUPTED;
> + goto rele;
> + }
> +
> + next_ip->i_prev_unlinked = prev_agino;
> + trace_xfs_iunlink_reload_next(next_ip);
> +rele:
> + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
> + xfs_irele(next_ip);
> + return error;
> +}
> +
> static int
> xfs_iunlink_insert_inode(
> struct xfs_trans *tp,
> @@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
> * inode.
> */
> error = xfs_iunlink_update_backref(pag, agino, next_agino);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
> if (error)
> return error;
>
> @@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
> */
> error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
> ip->i_next_unlinked);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
> + ip->i_next_unlinked);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 36bd42ed9ec8..f4e46bac9b91 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> __entry->new_ptr)
> );
>
> +TRACE_EVENT(xfs_iunlink_reload_next,
> + TP_PROTO(struct xfs_inode *ip),
> + TP_ARGS(ip),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_agnumber_t, agno)
> + __field(xfs_agino_t, agino)
> + __field(xfs_agino_t, prev_agino)
> + __field(xfs_agino_t, next_agino)
> + ),
> + TP_fast_assign(
> + __entry->dev = ip->i_mount->m_super->s_dev;
> + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> + __entry->prev_agino = ip->i_prev_unlinked;
> + __entry->next_agino = ip->i_next_unlinked;
> + ),
> + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->agno,
> + __entry->agino,
> + __entry->prev_agino,
> + __entry->next_agino)
> +);
> +
> DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> TP_PROTO(struct xfs_inode *ip),
> TP_ARGS(ip),
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
` (5 preceding siblings ...)
2023-08-31 17:43 ` Eric Sandeen
@ 2023-08-31 18:10 ` Bill O'Donnell
2023-08-31 20:18 ` Bill O'Donnell
2023-09-01 14:31 ` Bill O'Donnell
7 siblings, 1 reply; 15+ messages in thread
From: Bill O'Donnell @ 2023-08-31 18:10 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, Eric Sandeen, xfs, shrikanth hegde, Ritesh Harjani
On Wed, Aug 30, 2023 at 08:26:59AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> shrikanth hegde reports that filesystems fail shortly after mount with
> the following failure:
>
> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>
> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>
> From diagnostic data collected by the bug reporters, it would appear
> that we cleanly mounted a filesystem that contained unlinked inodes.
> Unlinked inodes are only processed as a final step of log recovery,
> which means that clean mounts do not process the unlinked list at all.
>
> Prior to the introduction of the incore unlinked lists, this wasn't a
> problem because the unlink code would (very expensively) traverse the
> entire ondisk metadata iunlink chain to keep things up to date.
> However, the incore unlinked list code complains when it realizes that
> it is out of sync with the ondisk metadata and shuts down the fs, which
> is bad.
>
> Ritesh proposed to solve this problem by unconditionally parsing the
> unlinked lists at mount time, but this imposes a mount time cost for
> every filesystem to catch something that should be very infrequent.
> Instead, let's target the places where we can encounter a next_unlinked
> pointer that refers to an inode that is not in cache, and load it into
> cache.
>
> Note: This patch does not address the problem of iget loading an inode
> from the middle of the iunlink list and needing to set i_prev_unlinked
> correctly.
>
> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
This LGTM. Thanks!
Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
> ---
> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> and actually return ENOLINK
> ---
> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> 2 files changed, 96 insertions(+), 4 deletions(-)
>
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 6ee266be45d4..2942002560b5 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
>
> rcu_read_lock();
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> + if (!ip) {
> + /* Caller can handle inode not being in memory. */
> + rcu_read_unlock();
> + return NULL;
> + }
>
> /*
> - * Inode not in memory or in RCU freeing limbo should not happen.
> - * Warn about this and let the caller handle the failure.
> + * Inode in RCU freeing limbo should not happen. Warn about this and
> + * let the caller handle the failure.
> */
> - if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
> + if (WARN_ON_ONCE(!ip->i_ino)) {
> rcu_read_unlock();
> return NULL;
> }
> @@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
>
> ip = xfs_iunlink_lookup(pag, next_agino);
> if (!ip)
> - return -EFSCORRUPTED;
> + return -ENOLINK;
> +
> ip->i_prev_unlinked = prev_agino;
> return 0;
> }
> @@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
> return 0;
> }
>
> +/*
> + * Load the inode @next_agino into the cache and set its prev_unlinked pointer
> + * to @prev_agino. Caller must hold the AGI to synchronize with other changes
> + * to the unlinked list.
> + */
> +STATIC int
> +xfs_iunlink_reload_next(
> + struct xfs_trans *tp,
> + struct xfs_buf *agibp,
> + xfs_agino_t prev_agino,
> + xfs_agino_t next_agino)
> +{
> + struct xfs_perag *pag = agibp->b_pag;
> + struct xfs_mount *mp = pag->pag_mount;
> + struct xfs_inode *next_ip = NULL;
> + xfs_ino_t ino;
> + int error;
> +
> + ASSERT(next_agino != NULLAGINO);
> +
> +#ifdef DEBUG
> + rcu_read_lock();
> + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
> + ASSERT(next_ip == NULL);
> + rcu_read_unlock();
> +#endif
> +
> + xfs_info_ratelimited(mp,
> + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
> + next_agino, pag->pag_agno);
> +
> + /*
> + * Use an untrusted lookup to be cautious in case the AGI has been
> + * corrupted and now points at a free inode. That shouldn't happen,
> + * but we'd rather shut down now since we're already running in a weird
> + * situation.
> + */
> + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
> + if (error)
> + return error;
> +
> + /* If this is not an unlinked inode, something is very wrong. */
> + if (VFS_I(next_ip)->i_nlink != 0) {
> + error = -EFSCORRUPTED;
> + goto rele;
> + }
> +
> + next_ip->i_prev_unlinked = prev_agino;
> + trace_xfs_iunlink_reload_next(next_ip);
> +rele:
> + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
> + xfs_irele(next_ip);
> + return error;
> +}
> +
> static int
> xfs_iunlink_insert_inode(
> struct xfs_trans *tp,
> @@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
> * inode.
> */
> error = xfs_iunlink_update_backref(pag, agino, next_agino);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
> if (error)
> return error;
>
> @@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
> */
> error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
> ip->i_next_unlinked);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
> + ip->i_next_unlinked);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 36bd42ed9ec8..f4e46bac9b91 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> __entry->new_ptr)
> );
>
> +TRACE_EVENT(xfs_iunlink_reload_next,
> + TP_PROTO(struct xfs_inode *ip),
> + TP_ARGS(ip),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_agnumber_t, agno)
> + __field(xfs_agino_t, agino)
> + __field(xfs_agino_t, prev_agino)
> + __field(xfs_agino_t, next_agino)
> + ),
> + TP_fast_assign(
> + __entry->dev = ip->i_mount->m_super->s_dev;
> + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> + __entry->prev_agino = ip->i_prev_unlinked;
> + __entry->next_agino = ip->i_next_unlinked;
> + ),
> + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->agno,
> + __entry->agino,
> + __entry->prev_agino,
> + __entry->next_agino)
> +);
> +
> DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> TP_PROTO(struct xfs_inode *ip),
> TP_ARGS(ip),
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] xfs_db: dump unlinked buckets
2023-08-30 23:25 ` [PATCH 1/2] xfs_db: dump unlinked buckets Darrick J. Wong
@ 2023-08-31 20:01 ` Bill O'Donnell
0 siblings, 0 replies; 15+ messages in thread
From: Bill O'Donnell @ 2023-08-31 20:01 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, Eric Sandeen, xfs, shrikanth hegde, Ritesh Harjani
On Wed, Aug 30, 2023 at 04:25:09PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Create a new command to dump the resource usage of files in the unlinked
> buckets.
>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
> ---
> db/Makefile | 2
> db/command.c | 1
> db/command.h | 1
> db/iunlink.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++
> libxfs/libxfs_api_defs.h | 1
> man/man8/xfs_db.8 | 19 ++++
> 6 files changed, 227 insertions(+), 1 deletion(-)
> create mode 100644 db/iunlink.c
>
> diff --git a/db/Makefile b/db/Makefile
> index 2f95f670..d00801ab 100644
> --- a/db/Makefile
> +++ b/db/Makefile
> @@ -14,7 +14,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
> io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
> sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
> fuzz.h obfuscate.h
> -CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
> +CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c iunlink.c namei.c \
> timelimit.c
> LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
>
> diff --git a/db/command.c b/db/command.c
> index 02f778b9..b4021c86 100644
> --- a/db/command.c
> +++ b/db/command.c
> @@ -127,6 +127,7 @@ init_commands(void)
> info_init();
> inode_init();
> input_init();
> + iunlink_init();
> logres_init();
> logformat_init();
> io_init();
> diff --git a/db/command.h b/db/command.h
> index 498983ff..a89e7150 100644
> --- a/db/command.h
> +++ b/db/command.h
> @@ -34,3 +34,4 @@ extern void info_init(void);
> extern void btheight_init(void);
> extern void timelimit_init(void);
> extern void namei_init(void);
> +extern void iunlink_init(void);
> diff --git a/db/iunlink.c b/db/iunlink.c
> new file mode 100644
> index 00000000..303b5daf
> --- /dev/null
> +++ b/db/iunlink.c
> @@ -0,0 +1,204 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
> + * Author: Darrick J. Wong <djwong@kernel.org>
> + */
> +#include "libxfs.h"
> +#include "command.h"
> +#include "output.h"
> +#include "init.h"
> +
> +static xfs_filblks_t
> +count_rtblocks(
> + struct xfs_inode *ip)
> +{
> + struct xfs_iext_cursor icur;
> + struct xfs_bmbt_irec got;
> + xfs_filblks_t count = 0;
> + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
> + int error;
> +
> + error = -libxfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> + if (error) {
> + dbprintf(
> +_("could not read AG %u agino %u extents, err=%d\n"),
> + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
> + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
> + error);
> + return 0;
> + }
> +
> + for_each_xfs_iext(ifp, &icur, &got)
> + if (!isnullstartblock(got.br_startblock))
> + count += got.br_blockcount;
> + return count;
> +}
> +
> +static xfs_agino_t
> +get_next_unlinked(
> + xfs_agnumber_t agno,
> + xfs_agino_t agino,
> + bool verbose)
> +{
> + struct xfs_buf *ino_bp;
> + struct xfs_dinode *dip;
> + struct xfs_inode *ip;
> + xfs_ino_t ino;
> + xfs_agino_t ret;
> + int error;
> +
> + ino = XFS_AGINO_TO_INO(mp, agno, agino);
> + error = -libxfs_iget(mp, NULL, ino, 0, &ip);
> + if (error)
> + goto bad;
> +
> + if (verbose) {
> + xfs_filblks_t blocks, rtblks = 0;
> +
> + if (XFS_IS_REALTIME_INODE(ip))
> + rtblks = count_rtblocks(ip);
> + blocks = ip->i_nblocks - rtblks;
> +
> + dbprintf(_(" blocks %llu rtblocks %llu\n"),
> + blocks, rtblks);
> + } else {
> + dbprintf("\n");
> + }
> +
> + error = -libxfs_imap_to_bp(mp, NULL, &ip->i_imap, &ino_bp);
> + if (error)
> + goto bad;
> +
> + dip = xfs_buf_offset(ino_bp, ip->i_imap.im_boffset);
> + ret = be32_to_cpu(dip->di_next_unlinked);
> + libxfs_buf_relse(ino_bp);
> +
> + return ret;
> +bad:
> + dbprintf(_("AG %u agino %u: %s\n"), agno, agino, strerror(error));
> + return NULLAGINO;
> +}
> +
> +static void
> +dump_unlinked_bucket(
> + xfs_agnumber_t agno,
> + struct xfs_buf *agi_bp,
> + unsigned int bucket,
> + bool quiet,
> + bool verbose)
> +{
> + struct xfs_agi *agi = agi_bp->b_addr;
> + xfs_agino_t agino;
> + unsigned int i = 0;
> +
> + agino = be32_to_cpu(agi->agi_unlinked[bucket]);
> + if (agino != NULLAGINO)
> + dbprintf(_("AG %u bucket %u agino %u"), agno, bucket, agino);
> + else if (!quiet && agino == NULLAGINO)
> + dbprintf(_("AG %u bucket %u agino NULL\n"), agno, bucket);
> +
> + while (agino != NULLAGINO) {
> + agino = get_next_unlinked(agno, agino, verbose);
> + if (agino != NULLAGINO)
> + dbprintf(_(" [%u] agino %u"), i++, agino);
> + else if (!quiet && agino == NULLAGINO)
> + dbprintf(_(" [%u] agino NULL\n"), i++);
> + }
> +}
> +
> +static void
> +dump_unlinked(
> + struct xfs_perag *pag,
> + unsigned int bucket,
> + bool quiet,
> + bool verbose)
> +{
> + struct xfs_buf *agi_bp;
> + xfs_agnumber_t agno = pag->pag_agno;
> + int error;
> +
> + error = -libxfs_ialloc_read_agi(pag, NULL, &agi_bp);
> + if (error) {
> + dbprintf(_("AGI %u: %s\n"), agno, strerror(errno));
> + return;
> + }
> +
> + if (bucket != -1U) {
> + dump_unlinked_bucket(agno, agi_bp, bucket, quiet, verbose);
> + goto relse;
> + }
> +
> + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
> + dump_unlinked_bucket(agno, agi_bp, bucket, quiet, verbose);
> + }
> +
> +relse:
> + libxfs_buf_relse(agi_bp);
> +}
> +
> +static int
> +dump_iunlinked_f(
> + int argc,
> + char **argv)
> +{
> + struct xfs_perag *pag;
> + xfs_agnumber_t agno = NULLAGNUMBER;
> + unsigned int bucket = -1U;
> + bool quiet = false;
> + bool verbose = false;
> + int c;
> +
> + while ((c = getopt(argc, argv, "a:b:qv")) != EOF) {
> + switch (c) {
> + case 'a':
> + agno = atoi(optarg);
> + if (agno >= mp->m_sb.sb_agcount) {
> + dbprintf(_("Unknown AG %u, agcount is %u.\n"),
> + agno, mp->m_sb.sb_agcount);
> + return 0;
> + }
> + break;
> + case 'b':
> + bucket = atoi(optarg);
> + if (bucket >= XFS_AGI_UNLINKED_BUCKETS) {
> + dbprintf(_("Unknown bucket %u, max is 63.\n"),
> + bucket);
> + return 0;
> + }
> + break;
> + case 'q':
> + quiet = true;
> + break;
> + case 'v':
> + verbose = true;
> + break;
> + default:
> + dbprintf(_("Bad option for dump_iunlinked command.\n"));
> + return 0;
> + }
> + }
> +
> + if (agno != NULLAGNUMBER) {
> + struct xfs_perag *pag = libxfs_perag_get(mp, agno);
> +
> + dump_unlinked(pag, bucket, quiet, verbose);
> + libxfs_perag_put(pag);
> + return 0;
> + }
> +
> + for_each_perag(mp, agno, pag)
> + dump_unlinked(pag, bucket, quiet, verbose);
> +
> + return 0;
> +}
> +
> +static const cmdinfo_t dump_iunlinked_cmd =
> + { "dump_iunlinked", NULL, dump_iunlinked_f, 0, -1, 0,
> + N_("[-a agno] [-b bucket] [-q] [-v]"),
> + N_("dump chain of unlinked inode buckets"), NULL };
> +
> +void
> +iunlink_init(void)
> +{
> + add_command(&dump_iunlinked_cmd);
> +}
> diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
> index 026aa510..ddba5c7c 100644
> --- a/libxfs/libxfs_api_defs.h
> +++ b/libxfs/libxfs_api_defs.h
> @@ -125,6 +125,7 @@
> #define xfs_idestroy_fork libxfs_idestroy_fork
> #define xfs_iext_lookup_extent libxfs_iext_lookup_extent
> #define xfs_ifork_zap_attr libxfs_ifork_zap_attr
> +#define xfs_imap_to_bp libxfs_imap_to_bp
> #define xfs_initialize_perag libxfs_initialize_perag
> #define xfs_initialize_perag_data libxfs_initialize_perag_data
> #define xfs_init_local_fork libxfs_init_local_fork
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index 60dcdc52..2d6d0da4 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -579,6 +579,25 @@ print the current debug option bits. These are for the use of the implementor.
> .BI "dquot [" \-g | \-p | \-u ] " id"
> Set current address to a group, project or user quota block for the given ID. Defaults to user quota.
> .TP
> +.BI "dump_iunlinked [-a " agno " ] [-b " bucket " ] [-q] [-v]"
> +Dump the contents of unlinked buckets.
> +
> +Options include:
> +.RS 1.0i
> +.TP 0.4i
> +.B \-a
> +Print only this AG's unlinked buckets.
> +.TP 0.4i
> +.B \-b
> +Print only this bucket within each AGI.
> +.TP 0.4i
> +.B \-q
> +Only print the essentials.
> +.TP 0.4i
> +.B \-v
> +Print resource usage of each file on the unlinked lists.
> +.RE
> +.TP
> .BI "echo [" arg "] ..."
> Echo the arguments to the output.
> .TP
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 2/2] xfs_db: create unlinked inodes
2023-08-30 23:25 ` [PATCH 2/2] xfs_db: create unlinked inodes Darrick J. Wong
@ 2023-08-31 20:02 ` Bill O'Donnell
0 siblings, 0 replies; 15+ messages in thread
From: Bill O'Donnell @ 2023-08-31 20:02 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, Eric Sandeen, xfs, shrikanth hegde, Ritesh Harjani
On Wed, Aug 30, 2023 at 04:25:29PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Create an expert-mode debugger command to create unlinked inodes.
> This will hopefully aid in simulation of leaked unlinked inode handling
> in the kernel and elsewhere.
>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
> ---
> db/iunlink.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++
> libxfs/libxfs_api_defs.h | 1
> man/man8/xfs_db.8 | 11 +++
> 3 files changed, 208 insertions(+)
>
> diff --git a/db/iunlink.c b/db/iunlink.c
> index 303b5daf..d87562e3 100644
> --- a/db/iunlink.c
> +++ b/db/iunlink.c
> @@ -197,8 +197,204 @@ static const cmdinfo_t dump_iunlinked_cmd =
> N_("[-a agno] [-b bucket] [-q] [-v]"),
> N_("dump chain of unlinked inode buckets"), NULL };
>
> +/*
> + * Look up the inode cluster buffer and log the on-disk unlinked inode change
> + * we need to make.
> + */
> +static int
> +iunlink_log_dinode(
> + struct xfs_trans *tp,
> + struct xfs_inode *ip,
> + struct xfs_perag *pag,
> + xfs_agino_t next_agino)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> + struct xfs_dinode *dip;
> + struct xfs_buf *ibp;
> + int offset;
> + int error;
> +
> + error = -libxfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
> + if (error)
> + return error;
> +
> + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
> +
> + dip->di_next_unlinked = cpu_to_be32(next_agino);
> + offset = ip->i_imap.im_boffset +
> + offsetof(struct xfs_dinode, di_next_unlinked);
> +
> + libxfs_dinode_calc_crc(mp, dip);
> + libxfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
> + return 0;
> +}
> +
> +static int
> +iunlink_insert_inode(
> + struct xfs_trans *tp,
> + struct xfs_perag *pag,
> + struct xfs_buf *agibp,
> + struct xfs_inode *ip)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> + struct xfs_agi *agi = agibp->b_addr;
> + xfs_agino_t next_agino;
> + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
> + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
> + int offset;
> + int error;
> +
> + /*
> + * Get the index into the agi hash table for the list this inode will
> + * go on. Make sure the pointer isn't garbage and that this inode
> + * isn't already on the list.
> + */
> + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
> + if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino))
> + return EFSCORRUPTED;
> +
> + if (next_agino != NULLAGINO) {
> + /*
> + * There is already another inode in the bucket, so point this
> + * inode to the current head of the list.
> + */
> + error = iunlink_log_dinode(tp, ip, pag, next_agino);
> + if (error)
> + return error;
> + }
> +
> + /* Update the bucket. */
> + agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
> + offset = offsetof(struct xfs_agi, agi_unlinked) +
> + (sizeof(xfs_agino_t) * bucket_index);
> + libxfs_trans_log_buf(tp, agibp, offset,
> + offset + sizeof(xfs_agino_t) - 1);
> + return 0;
> +}
> +
> +/*
> + * This is called when the inode's link count has gone to 0 or we are creating
> + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
> + *
> + * We place the on-disk inode on a list in the AGI. It will be pulled from this
> + * list when the inode is freed.
> + */
> +static int
> +iunlink(
> + struct xfs_trans *tp,
> + struct xfs_inode *ip)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> + struct xfs_perag *pag;
> + struct xfs_buf *agibp;
> + int error;
> +
> + ASSERT(VFS_I(ip)->i_nlink == 0);
> + ASSERT(VFS_I(ip)->i_mode != 0);
> +
> + pag = libxfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
> +
> + /* Get the agi buffer first. It ensures lock ordering on the list. */
> + error = -libxfs_read_agi(pag, tp, &agibp);
> + if (error)
> + goto out;
> +
> + error = iunlink_insert_inode(tp, pag, agibp, ip);
> +out:
> + libxfs_perag_put(pag);
> + return error;
> +}
> +
> +static int
> +create_unlinked(
> + struct xfs_mount *mp)
> +{
> + struct cred cr = { };
> + struct fsxattr fsx = { };
> + struct xfs_inode *ip;
> + struct xfs_trans *tp;
> + unsigned int resblks;
> + int error;
> +
> + resblks = XFS_IALLOC_SPACE_RES(mp);
> + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_create_tmpfile, resblks,
> + 0, 0, &tp);
> + if (error) {
> + dbprintf(_("alloc trans: %s\n"), strerror(error));
> + return error;
> + }
> +
> + error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG | 0600, 0, 0, &cr, &fsx,
> + &ip);
> + if (error) {
> + dbprintf(_("create inode: %s\n"), strerror(error));
> + goto out_cancel;
> + }
> +
> + error = iunlink(tp, ip);
> + if (error) {
> + dbprintf(_("unlink inode: %s\n"), strerror(error));
> + goto out_rele;
> + }
> +
> + error = -libxfs_trans_commit(tp);
> + if (error)
> + dbprintf(_("commit inode: %s\n"), strerror(error));
> +
> + dbprintf(_("Created unlinked inode %llu in agno %u\n"),
> + (unsigned long long)ip->i_ino,
> + XFS_INO_TO_AGNO(mp, ip->i_ino));
> + libxfs_irele(ip);
> + return error;
> +out_rele:
> + libxfs_irele(ip);
> +out_cancel:
> + libxfs_trans_cancel(tp);
> + return error;
> +}
> +
> +static int
> +iunlink_f(
> + int argc,
> + char **argv)
> +{
> + int nr = 1;
> + int c;
> + int error;
> +
> + while ((c = getopt(argc, argv, "n:")) != EOF) {
> + switch (c) {
> + case 'n':
> + nr = atoi(optarg);
> + if (nr <= 0) {
> + dbprintf(_("%s: need positive number\n"));
> + return 0;
> + }
> + break;
> + default:
> + dbprintf(_("Bad option for iunlink command.\n"));
> + return 0;
> + }
> + }
> +
> + for (c = 0; c < nr; c++) {
> + error = create_unlinked(mp);
> + if (error)
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> +static const cmdinfo_t iunlink_cmd =
> + { "iunlink", NULL, iunlink_f, 0, -1, 0,
> + N_("[-n nr]"),
> + N_("allocate inodes and put them on the unlinked list"), NULL };
> +
> void
> iunlink_init(void)
> {
> add_command(&dump_iunlinked_cmd);
> + if (expert_mode)
> + add_command(&iunlink_cmd);
> }
> diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
> index ddba5c7c..04277c00 100644
> --- a/libxfs/libxfs_api_defs.h
> +++ b/libxfs/libxfs_api_defs.h
> @@ -149,6 +149,7 @@
> #define xfs_prealloc_blocks libxfs_prealloc_blocks
>
> #define xfs_read_agf libxfs_read_agf
> +#define xfs_read_agi libxfs_read_agi
> #define xfs_refc_block libxfs_refc_block
> #define xfs_refcountbt_calc_reserves libxfs_refcountbt_calc_reserves
> #define xfs_refcountbt_calc_size libxfs_refcountbt_calc_size
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index 2d6d0da4..f53ddd67 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -840,6 +840,17 @@ Set the current inode number. If no
> .I inode#
> is given, print the current inode number.
> .TP
> +.BI "iunlink [-n " nr " ]"
> +Allocate inodes and put them on the unlinked list.
> +
> +Options include:
> +.RS 1.0i
> +.TP 0.4i
> +.B \-n
> +Create this number of unlinked inodes.
> +If not specified, 1 inode will be created.
> +.RE
> +.TP
> .BI "label [" label ]
> Set the filesystem label. The filesystem label can be used by
> .BR mount (8)
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-31 18:10 ` Bill O'Donnell
@ 2023-08-31 20:18 ` Bill O'Donnell
0 siblings, 0 replies; 15+ messages in thread
From: Bill O'Donnell @ 2023-08-31 20:18 UTC (permalink / raw)
To: Bill O'Donnell
Cc: Darrick J. Wong, Dave Chinner, Eric Sandeen, xfs, shrikanth hegde,
Ritesh Harjani
I jumped the gun and only reviewed the userspace patches. Sorry,
I'll actually look at the kernel patch now. ;)
-Bill
On Thu, Aug 31, 2023 at 01:10:49PM -0500, Bill O'Donnell wrote:
> On Wed, Aug 30, 2023 at 08:26:59AM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > shrikanth hegde reports that filesystems fail shortly after mount with
> > the following failure:
> >
> > WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
> >
> > This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
> >
> > ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> > if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
> >
> > From diagnostic data collected by the bug reporters, it would appear
> > that we cleanly mounted a filesystem that contained unlinked inodes.
> > Unlinked inodes are only processed as a final step of log recovery,
> > which means that clean mounts do not process the unlinked list at all.
> >
> > Prior to the introduction of the incore unlinked lists, this wasn't a
> > problem because the unlink code would (very expensively) traverse the
> > entire ondisk metadata iunlink chain to keep things up to date.
> > However, the incore unlinked list code complains when it realizes that
> > it is out of sync with the ondisk metadata and shuts down the fs, which
> > is bad.
> >
> > Ritesh proposed to solve this problem by unconditionally parsing the
> > unlinked lists at mount time, but this imposes a mount time cost for
> > every filesystem to catch something that should be very infrequent.
> > Instead, let's target the places where we can encounter a next_unlinked
> > pointer that refers to an inode that is not in cache, and load it into
> > cache.
> >
> > Note: This patch does not address the problem of iget loading an inode
> > from the middle of the iunlink list and needing to set i_prev_unlinked
> > correctly.
> >
> > Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> > Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
>
> This LGTM. Thanks!
> Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
>
> > ---
> > v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> > and actually return ENOLINK
> > ---
> > fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> > fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> > 2 files changed, 96 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 6ee266be45d4..2942002560b5 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
> >
> > rcu_read_lock();
> > ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> > + if (!ip) {
> > + /* Caller can handle inode not being in memory. */
> > + rcu_read_unlock();
> > + return NULL;
> > + }
> >
> > /*
> > - * Inode not in memory or in RCU freeing limbo should not happen.
> > - * Warn about this and let the caller handle the failure.
> > + * Inode in RCU freeing limbo should not happen. Warn about this and
> > + * let the caller handle the failure.
> > */
> > - if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
> > + if (WARN_ON_ONCE(!ip->i_ino)) {
> > rcu_read_unlock();
> > return NULL;
> > }
> > @@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
> >
> > ip = xfs_iunlink_lookup(pag, next_agino);
> > if (!ip)
> > - return -EFSCORRUPTED;
> > + return -ENOLINK;
> > +
> > ip->i_prev_unlinked = prev_agino;
> > return 0;
> > }
> > @@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
> > return 0;
> > }
> >
> > +/*
> > + * Load the inode @next_agino into the cache and set its prev_unlinked pointer
> > + * to @prev_agino. Caller must hold the AGI to synchronize with other changes
> > + * to the unlinked list.
> > + */
> > +STATIC int
> > +xfs_iunlink_reload_next(
> > + struct xfs_trans *tp,
> > + struct xfs_buf *agibp,
> > + xfs_agino_t prev_agino,
> > + xfs_agino_t next_agino)
> > +{
> > + struct xfs_perag *pag = agibp->b_pag;
> > + struct xfs_mount *mp = pag->pag_mount;
> > + struct xfs_inode *next_ip = NULL;
> > + xfs_ino_t ino;
> > + int error;
> > +
> > + ASSERT(next_agino != NULLAGINO);
> > +
> > +#ifdef DEBUG
> > + rcu_read_lock();
> > + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
> > + ASSERT(next_ip == NULL);
> > + rcu_read_unlock();
> > +#endif
> > +
> > + xfs_info_ratelimited(mp,
> > + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
> > + next_agino, pag->pag_agno);
> > +
> > + /*
> > + * Use an untrusted lookup to be cautious in case the AGI has been
> > + * corrupted and now points at a free inode. That shouldn't happen,
> > + * but we'd rather shut down now since we're already running in a weird
> > + * situation.
> > + */
> > + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> > + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
> > + if (error)
> > + return error;
> > +
> > + /* If this is not an unlinked inode, something is very wrong. */
> > + if (VFS_I(next_ip)->i_nlink != 0) {
> > + error = -EFSCORRUPTED;
> > + goto rele;
> > + }
> > +
> > + next_ip->i_prev_unlinked = prev_agino;
> > + trace_xfs_iunlink_reload_next(next_ip);
> > +rele:
> > + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
> > + xfs_irele(next_ip);
> > + return error;
> > +}
> > +
> > static int
> > xfs_iunlink_insert_inode(
> > struct xfs_trans *tp,
> > @@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
> > * inode.
> > */
> > error = xfs_iunlink_update_backref(pag, agino, next_agino);
> > + if (error == -ENOLINK)
> > + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
> > if (error)
> > return error;
> >
> > @@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
> > */
> > error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
> > ip->i_next_unlinked);
> > + if (error == -ENOLINK)
> > + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
> > + ip->i_next_unlinked);
> > if (error)
> > return error;
> >
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 36bd42ed9ec8..f4e46bac9b91 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> > __entry->new_ptr)
> > );
> >
> > +TRACE_EVENT(xfs_iunlink_reload_next,
> > + TP_PROTO(struct xfs_inode *ip),
> > + TP_ARGS(ip),
> > + TP_STRUCT__entry(
> > + __field(dev_t, dev)
> > + __field(xfs_agnumber_t, agno)
> > + __field(xfs_agino_t, agino)
> > + __field(xfs_agino_t, prev_agino)
> > + __field(xfs_agino_t, next_agino)
> > + ),
> > + TP_fast_assign(
> > + __entry->dev = ip->i_mount->m_super->s_dev;
> > + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> > + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> > + __entry->prev_agino = ip->i_prev_unlinked;
> > + __entry->next_agino = ip->i_next_unlinked;
> > + ),
> > + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
> > + MAJOR(__entry->dev), MINOR(__entry->dev),
> > + __entry->agno,
> > + __entry->agino,
> > + __entry->prev_agino,
> > + __entry->next_agino)
> > +);
> > +
> > DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> > TP_PROTO(struct xfs_inode *ip),
> > TP_ARGS(ip),
> >
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-31 12:39 ` [PATCH v2] xfs: load uncached unlinked inodes into memory " Ritesh Harjani
@ 2023-08-31 20:39 ` Eric Sandeen
2023-08-31 22:44 ` Darrick J. Wong
0 siblings, 1 reply; 15+ messages in thread
From: Eric Sandeen @ 2023-08-31 20:39 UTC (permalink / raw)
To: Ritesh Harjani (IBM), Darrick J. Wong, Dave Chinner, Eric Sandeen
Cc: xfs, shrikanth hegde
On 8/31/23 7:39 AM, Ritesh Harjani (IBM) wrote:
> "Darrick J. Wong" <djwong@kernel.org> writes:
>
>> From: Darrick J. Wong <djwong@kernel.org>
>>
>> shrikanth hegde reports that filesystems fail shortly after mount with
>> the following failure:
>>
>> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>>
>> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>>
>> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
>> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>>
>> From diagnostic data collected by the bug reporters, it would appear
>> that we cleanly mounted a filesystem that contained unlinked inodes.
>> Unlinked inodes are only processed as a final step of log recovery,
>> which means that clean mounts do not process the unlinked list at all.
>>
>> Prior to the introduction of the incore unlinked lists, this wasn't a
>> problem because the unlink code would (very expensively) traverse the
>> entire ondisk metadata iunlink chain to keep things up to date.
>> However, the incore unlinked list code complains when it realizes that
>> it is out of sync with the ondisk metadata and shuts down the fs, which
>> is bad.
>>
>> Ritesh proposed to solve this problem by unconditionally parsing the
>> unlinked lists at mount time, but this imposes a mount time cost for
>> every filesystem to catch something that should be very infrequent.
>> Instead, let's target the places where we can encounter a next_unlinked
>> pointer that refers to an inode that is not in cache, and load it into
>> cache.
>>
>> Note: This patch does not address the problem of iget loading an inode
>> from the middle of the iunlink list and needing to set i_prev_unlinked
>> correctly.
>>
>> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
>> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
>> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
>> ---
>> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
>> and actually return ENOLINK
>> ---
>> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
>> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
>> 2 files changed, 96 insertions(+), 4 deletions(-)
>
> Hi Darrick,
>
> Thanks for taking a look at this. I tested this patch on the setup where
> Shrikanth earlier saw the crash. I still can see a problem. I saw it is
> taking the branch from
>
> + /* If this is not an unlinked inode, something is very wrong. */
> + if (VFS_I(next_ip)->i_nlink != 0) {
> + error = -EFSCORRUPTED;
> + goto rele;
> + }
>
> Here are the logs of reference -
>
> [ 21.399573] XFS (dm-0): Found unrecovered unlinked inode 0x2ec44d in AG 0x0. Initiating recovery.
> [ 21.400150] XFS (dm-0): Internal error xfs_trans_cancel at line 1104 of file fs/xfs/xfs_trans.c. Caller xfs_remove+0x1a0/0x310 [xfs]
Do you have a metadump for that filesystem, to examine that inode?
-Eric
> [ 21.400222] CPU: 0 PID: 1629 Comm: systemd-tmpfile Not tainted 6.5.0+ #2
> [ 21.400226] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1010.22 (NH1010_122) hv:phyp pSeries
> [ 21.400230] Call Trace:
> [ 21.400231] [c000000014cdbb70] [c000000000f377b8] dump_stack_lvl+0x6c/0x9c (unreliable)
> [ 21.400239] [c000000014cdbba0] [c008000000f7c204] xfs_error_report+0x5c/0x80 [xfs]
> [ 21.400303] [c000000014cdbc00] [c008000000fab320] xfs_trans_cancel+0x178/0x1b0 [xfs]
> [ 21.400371] [c000000014cdbc50] [c008000000f999d8] xfs_remove+0x1a0/0x310 [xfs]
> [ 21.400432] [c000000014cdbcc0] [c008000000f93eb0] xfs_vn_unlink+0x68/0xf0 [xfs]
> [ 21.400493] [c000000014cdbd20] [c0000000005b8038] vfs_rmdir+0x178/0x300
> [ 21.400498] [c000000014cdbd60] [c0000000005be444] do_rmdir+0x124/0x240
> [ 21.400502] [c000000014cdbdf0] [c0000000005be594] sys_rmdir+0x34/0x50
> [ 21.400506] [c000000014cdbe10] [c000000000033c38] system_call_exception+0x148/0x3a0
> [ 21.400511] [c000000014cdbe50] [c00000000000c6d4] system_call_common+0xf4/0x258
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-31 20:39 ` Eric Sandeen
@ 2023-08-31 22:44 ` Darrick J. Wong
0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2023-08-31 22:44 UTC (permalink / raw)
To: Eric Sandeen
Cc: Ritesh Harjani (IBM), Dave Chinner, Eric Sandeen, xfs,
shrikanth hegde
On Thu, Aug 31, 2023 at 03:39:28PM -0500, Eric Sandeen wrote:
> On 8/31/23 7:39 AM, Ritesh Harjani (IBM) wrote:
> > "Darrick J. Wong" <djwong@kernel.org> writes:
> >
> >> From: Darrick J. Wong <djwong@kernel.org>
> >>
> >> shrikanth hegde reports that filesystems fail shortly after mount with
> >> the following failure:
> >>
> >> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
> >>
> >> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
> >>
> >> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> >> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
> >>
> >> From diagnostic data collected by the bug reporters, it would appear
> >> that we cleanly mounted a filesystem that contained unlinked inodes.
> >> Unlinked inodes are only processed as a final step of log recovery,
> >> which means that clean mounts do not process the unlinked list at all.
> >>
> >> Prior to the introduction of the incore unlinked lists, this wasn't a
> >> problem because the unlink code would (very expensively) traverse the
> >> entire ondisk metadata iunlink chain to keep things up to date.
> >> However, the incore unlinked list code complains when it realizes that
> >> it is out of sync with the ondisk metadata and shuts down the fs, which
> >> is bad.
> >>
> >> Ritesh proposed to solve this problem by unconditionally parsing the
> >> unlinked lists at mount time, but this imposes a mount time cost for
> >> every filesystem to catch something that should be very infrequent.
> >> Instead, let's target the places where we can encounter a next_unlinked
> >> pointer that refers to an inode that is not in cache, and load it into
> >> cache.
> >>
> >> Note: This patch does not address the problem of iget loading an inode
> >> from the middle of the iunlink list and needing to set i_prev_unlinked
> >> correctly.
> >>
> >> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> >> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> >> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> >> ---
> >> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> >> and actually return ENOLINK
> >> ---
> >> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> >> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> >> 2 files changed, 96 insertions(+), 4 deletions(-)
> >
> > Hi Darrick,
> >
> > Thanks for taking a look at this. I tested this patch on the setup where
> > Shrikanth earlier saw the crash. I still can see a problem. I saw it is
> > taking the branch from
> >
> > + /* If this is not an unlinked inode, something is very wrong. */
> > + if (VFS_I(next_ip)->i_nlink != 0) {
> > + error = -EFSCORRUPTED;
> > + goto rele;
> > + }
> >
> > Here are the logs of reference -
> >
> > [ 21.399573] XFS (dm-0): Found unrecovered unlinked inode 0x2ec44d in AG 0x0. Initiating recovery.
> > [ 21.400150] XFS (dm-0): Internal error xfs_trans_cancel at line 1104 of file fs/xfs/xfs_trans.c. Caller xfs_remove+0x1a0/0x310 [xfs]
>
> Do you have a metadump for that filesystem, to examine that inode?
IIRC, Ritesh's problem was that there were inodes on the unlinked list
*and* they had nonzero i_nlink. This patch doesn't address that; you'll
have to wait for the online repair version.
--D
> -Eric
>
> > [ 21.400222] CPU: 0 PID: 1629 Comm: systemd-tmpfile Not tainted 6.5.0+ #2
> > [ 21.400226] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1010.22 (NH1010_122) hv:phyp pSeries
> > [ 21.400230] Call Trace:
> > [ 21.400231] [c000000014cdbb70] [c000000000f377b8] dump_stack_lvl+0x6c/0x9c (unreliable)
> > [ 21.400239] [c000000014cdbba0] [c008000000f7c204] xfs_error_report+0x5c/0x80 [xfs]
> > [ 21.400303] [c000000014cdbc00] [c008000000fab320] xfs_trans_cancel+0x178/0x1b0 [xfs]
> > [ 21.400371] [c000000014cdbc50] [c008000000f999d8] xfs_remove+0x1a0/0x310 [xfs]
> > [ 21.400432] [c000000014cdbcc0] [c008000000f93eb0] xfs_vn_unlink+0x68/0xf0 [xfs]
> > [ 21.400493] [c000000014cdbd20] [c0000000005b8038] vfs_rmdir+0x178/0x300
> > [ 21.400498] [c000000014cdbd60] [c0000000005be444] do_rmdir+0x124/0x240
> > [ 21.400502] [c000000014cdbdf0] [c0000000005be594] sys_rmdir+0x34/0x50
> > [ 21.400506] [c000000014cdbe10] [c000000000033c38] system_call_exception+0x148/0x3a0
> > [ 21.400511] [c000000014cdbe50] [c00000000000c6d4] system_call_common+0xf4/0x258
>
>
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] xfs: load uncached unlinked inodes into memory on demand
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
` (6 preceding siblings ...)
2023-08-31 18:10 ` Bill O'Donnell
@ 2023-09-01 14:31 ` Bill O'Donnell
7 siblings, 0 replies; 15+ messages in thread
From: Bill O'Donnell @ 2023-09-01 14:31 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Dave Chinner, Eric Sandeen, xfs, shrikanth hegde, Ritesh Harjani
On Wed, Aug 30, 2023 at 08:26:59AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> shrikanth hegde reports that filesystems fail shortly after mount with
> the following failure:
>
> WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
>
> This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
>
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
>
> From diagnostic data collected by the bug reporters, it would appear
> that we cleanly mounted a filesystem that contained unlinked inodes.
> Unlinked inodes are only processed as a final step of log recovery,
> which means that clean mounts do not process the unlinked list at all.
>
> Prior to the introduction of the incore unlinked lists, this wasn't a
> problem because the unlink code would (very expensively) traverse the
> entire ondisk metadata iunlink chain to keep things up to date.
> However, the incore unlinked list code complains when it realizes that
> it is out of sync with the ondisk metadata and shuts down the fs, which
> is bad.
>
> Ritesh proposed to solve this problem by unconditionally parsing the
> unlinked lists at mount time, but this imposes a mount time cost for
> every filesystem to catch something that should be very infrequent.
> Instead, let's target the places where we can encounter a next_unlinked
> pointer that refers to an inode that is not in cache, and load it into
> cache.
>
> Note: This patch does not address the problem of iget loading an inode
> from the middle of the iunlink list and needing to set i_prev_unlinked
> correctly.
>
> Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
> Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
> v2: log that we're doing runtime recovery, dont mess with DONTCACHE,
> and actually return ENOLINK
> ---
> fs/xfs/xfs_inode.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> fs/xfs/xfs_trace.h | 25 +++++++++++++++++
> 2 files changed, 96 insertions(+), 4 deletions(-)
>
Makes sense. Feel free to add my RB.
Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 6ee266be45d4..2942002560b5 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
>
> rcu_read_lock();
> ip = radix_tree_lookup(&pag->pag_ici_root, agino);
> + if (!ip) {
> + /* Caller can handle inode not being in memory. */
> + rcu_read_unlock();
> + return NULL;
> + }
>
> /*
> - * Inode not in memory or in RCU freeing limbo should not happen.
> - * Warn about this and let the caller handle the failure.
> + * Inode in RCU freeing limbo should not happen. Warn about this and
> + * let the caller handle the failure.
> */
> - if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
> + if (WARN_ON_ONCE(!ip->i_ino)) {
> rcu_read_unlock();
> return NULL;
> }
> @@ -1858,7 +1863,8 @@ xfs_iunlink_update_backref(
>
> ip = xfs_iunlink_lookup(pag, next_agino);
> if (!ip)
> - return -EFSCORRUPTED;
> + return -ENOLINK;
> +
> ip->i_prev_unlinked = prev_agino;
> return 0;
> }
> @@ -1902,6 +1908,62 @@ xfs_iunlink_update_bucket(
> return 0;
> }
>
> +/*
> + * Load the inode @next_agino into the cache and set its prev_unlinked pointer
> + * to @prev_agino. Caller must hold the AGI to synchronize with other changes
> + * to the unlinked list.
> + */
> +STATIC int
> +xfs_iunlink_reload_next(
> + struct xfs_trans *tp,
> + struct xfs_buf *agibp,
> + xfs_agino_t prev_agino,
> + xfs_agino_t next_agino)
> +{
> + struct xfs_perag *pag = agibp->b_pag;
> + struct xfs_mount *mp = pag->pag_mount;
> + struct xfs_inode *next_ip = NULL;
> + xfs_ino_t ino;
> + int error;
> +
> + ASSERT(next_agino != NULLAGINO);
> +
> +#ifdef DEBUG
> + rcu_read_lock();
> + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
> + ASSERT(next_ip == NULL);
> + rcu_read_unlock();
> +#endif
> +
> + xfs_info_ratelimited(mp,
> + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
> + next_agino, pag->pag_agno);
> +
> + /*
> + * Use an untrusted lookup to be cautious in case the AGI has been
> + * corrupted and now points at a free inode. That shouldn't happen,
> + * but we'd rather shut down now since we're already running in a weird
> + * situation.
> + */
> + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
> + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
> + if (error)
> + return error;
> +
> + /* If this is not an unlinked inode, something is very wrong. */
> + if (VFS_I(next_ip)->i_nlink != 0) {
> + error = -EFSCORRUPTED;
> + goto rele;
> + }
> +
> + next_ip->i_prev_unlinked = prev_agino;
> + trace_xfs_iunlink_reload_next(next_ip);
> +rele:
> + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
> + xfs_irele(next_ip);
> + return error;
> +}
> +
> static int
> xfs_iunlink_insert_inode(
> struct xfs_trans *tp,
> @@ -1933,6 +1995,8 @@ xfs_iunlink_insert_inode(
> * inode.
> */
> error = xfs_iunlink_update_backref(pag, agino, next_agino);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
> if (error)
> return error;
>
> @@ -2027,6 +2091,9 @@ xfs_iunlink_remove_inode(
> */
> error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
> ip->i_next_unlinked);
> + if (error == -ENOLINK)
> + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
> + ip->i_next_unlinked);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 36bd42ed9ec8..f4e46bac9b91 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3832,6 +3832,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
> __entry->new_ptr)
> );
>
> +TRACE_EVENT(xfs_iunlink_reload_next,
> + TP_PROTO(struct xfs_inode *ip),
> + TP_ARGS(ip),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_agnumber_t, agno)
> + __field(xfs_agino_t, agino)
> + __field(xfs_agino_t, prev_agino)
> + __field(xfs_agino_t, next_agino)
> + ),
> + TP_fast_assign(
> + __entry->dev = ip->i_mount->m_super->s_dev;
> + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
> + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
> + __entry->prev_agino = ip->i_prev_unlinked;
> + __entry->next_agino = ip->i_next_unlinked;
> + ),
> + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->agno,
> + __entry->agino,
> + __entry->prev_agino,
> + __entry->next_agino)
> +);
> +
> DECLARE_EVENT_CLASS(xfs_ag_inode_class,
> TP_PROTO(struct xfs_inode *ip),
> TP_ARGS(ip),
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 2/2] xfs_db: create unlinked inodes
2023-09-25 21:59 [PATCHSET 0/2] xfsprogs: reload the last iunlink item Darrick J. Wong
@ 2023-09-25 21:59 ` Darrick J. Wong
0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2023-09-25 21:59 UTC (permalink / raw)
To: djwong, cem; +Cc: Bill O'Donnell, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create an expert-mode debugger command to create unlinked inodes.
This will hopefully aid in simulation of leaked unlinked inode handling
in the kernel and elsewhere.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Bill O'Donnell <bodonnel@redhat.com>
---
db/iunlink.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++
libxfs/libxfs_api_defs.h | 1
man/man8/xfs_db.8 | 11 +++
3 files changed, 208 insertions(+)
diff --git a/db/iunlink.c b/db/iunlink.c
index 303b5dafb7d..d87562e3b0a 100644
--- a/db/iunlink.c
+++ b/db/iunlink.c
@@ -197,8 +197,204 @@ static const cmdinfo_t dump_iunlinked_cmd =
N_("[-a agno] [-b bucket] [-q] [-v]"),
N_("dump chain of unlinked inode buckets"), NULL };
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = -libxfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ libxfs_dinode_calc_crc(mp, dip);
+ libxfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+static int
+iunlink_insert_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int offset;
+ int error;
+
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino))
+ return EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = iunlink_log_dinode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ }
+
+ /* Update the bucket. */
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ libxfs_trans_log_buf(tp, agibp, offset,
+ offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+ */
+static int
+iunlink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agibp;
+ int error;
+
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+
+ pag = libxfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = -libxfs_read_agi(pag, tp, &agibp);
+ if (error)
+ goto out;
+
+ error = iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+ libxfs_perag_put(pag);
+ return error;
+}
+
+static int
+create_unlinked(
+ struct xfs_mount *mp)
+{
+ struct cred cr = { };
+ struct fsxattr fsx = { };
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_create_tmpfile, resblks,
+ 0, 0, &tp);
+ if (error) {
+ dbprintf(_("alloc trans: %s\n"), strerror(error));
+ return error;
+ }
+
+ error = -libxfs_dir_ialloc(&tp, NULL, S_IFREG | 0600, 0, 0, &cr, &fsx,
+ &ip);
+ if (error) {
+ dbprintf(_("create inode: %s\n"), strerror(error));
+ goto out_cancel;
+ }
+
+ error = iunlink(tp, ip);
+ if (error) {
+ dbprintf(_("unlink inode: %s\n"), strerror(error));
+ goto out_rele;
+ }
+
+ error = -libxfs_trans_commit(tp);
+ if (error)
+ dbprintf(_("commit inode: %s\n"), strerror(error));
+
+ dbprintf(_("Created unlinked inode %llu in agno %u\n"),
+ (unsigned long long)ip->i_ino,
+ XFS_INO_TO_AGNO(mp, ip->i_ino));
+ libxfs_irele(ip);
+ return error;
+out_rele:
+ libxfs_irele(ip);
+out_cancel:
+ libxfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+iunlink_f(
+ int argc,
+ char **argv)
+{
+ int nr = 1;
+ int c;
+ int error;
+
+ while ((c = getopt(argc, argv, "n:")) != EOF) {
+ switch (c) {
+ case 'n':
+ nr = atoi(optarg);
+ if (nr <= 0) {
+ dbprintf(_("%s: need positive number\n"));
+ return 0;
+ }
+ break;
+ default:
+ dbprintf(_("Bad option for iunlink command.\n"));
+ return 0;
+ }
+ }
+
+ for (c = 0; c < nr; c++) {
+ error = create_unlinked(mp);
+ if (error)
+ return 1;
+ }
+
+ return 0;
+}
+
+static const cmdinfo_t iunlink_cmd =
+ { "iunlink", NULL, iunlink_f, 0, -1, 0,
+ N_("[-n nr]"),
+ N_("allocate inodes and put them on the unlinked list"), NULL };
+
void
iunlink_init(void)
{
add_command(&dump_iunlinked_cmd);
+ if (expert_mode)
+ add_command(&iunlink_cmd);
}
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index ddba5c7c71d..04277c00955 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -149,6 +149,7 @@
#define xfs_prealloc_blocks libxfs_prealloc_blocks
#define xfs_read_agf libxfs_read_agf
+#define xfs_read_agi libxfs_read_agi
#define xfs_refc_block libxfs_refc_block
#define xfs_refcountbt_calc_reserves libxfs_refcountbt_calc_reserves
#define xfs_refcountbt_calc_size libxfs_refcountbt_calc_size
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 2d6d0da4d7b..f53ddd67d87 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -840,6 +840,17 @@ Set the current inode number. If no
.I inode#
is given, print the current inode number.
.TP
+.BI "iunlink [-n " nr " ]"
+Allocate inodes and put them on the unlinked list.
+
+Options include:
+.RS 1.0i
+.TP 0.4i
+.B \-n
+Create this number of unlinked inodes.
+If not specified, 1 inode will be created.
+.RE
+.TP
.BI "label [" label ]
Set the filesystem label. The filesystem label can be used by
.BR mount (8)
^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2023-09-25 21:59 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-30 15:26 [PATCH v2] xfs: load uncached unlinked inodes into memory on demand Darrick J. Wong
2023-08-30 22:29 ` Dave Chinner
2023-08-30 23:25 ` [PATCH 1/2] xfs_db: dump unlinked buckets Darrick J. Wong
2023-08-31 20:01 ` Bill O'Donnell
2023-08-30 23:25 ` [PATCH 2/2] xfs_db: create unlinked inodes Darrick J. Wong
2023-08-31 20:02 ` Bill O'Donnell
2023-08-30 23:26 ` [RFC PATCH] fstests: test unlinked inode list repair on demand Darrick J. Wong
2023-08-31 12:39 ` [PATCH v2] xfs: load uncached unlinked inodes into memory " Ritesh Harjani
2023-08-31 20:39 ` Eric Sandeen
2023-08-31 22:44 ` Darrick J. Wong
2023-08-31 17:43 ` Eric Sandeen
2023-08-31 18:10 ` Bill O'Donnell
2023-08-31 20:18 ` Bill O'Donnell
2023-09-01 14:31 ` Bill O'Donnell
-- strict thread matches above, loose matches on Subject: below --
2023-09-25 21:59 [PATCHSET 0/2] xfsprogs: reload the last iunlink item Darrick J. Wong
2023-09-25 21:59 ` [PATCH 2/2] xfs_db: create unlinked inodes Darrick J. Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox