* [PATCH 01/26] libfrog: add a function to grab the path from an open fd and a file handle
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
@ 2026-03-19 4:39 ` Darrick J. Wong
2026-03-19 4:39 ` [PATCH 02/26] libfrog: create healthmon event log library functions Darrick J. Wong
` (24 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:39 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
handle_walk_paths operates on a file handle, but requires that the fs
has been registered with libhandle via path_to_fshandle. For a normal
libhandle client this is the desirable behavior because the application
*should* maintain an open fd to the filesystem mount.
However for xfs_healer this isn't going to work well because the healer
mustn't pin the mount while it's running. It's smart enough to know how
to find and reconnect to the mountpoint, but libhandle doesn't have any
such concept.
Therefore, alter the libfrog getparents code so that xfs_healer can pass
in the mountpoint and reconnected fd without needing libhandle. All
we're really doing here is trying to obtain a user-visible path for a
file that encountered problems for logging purposes; if it fails, we'll
fall back to logging the inode number.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
libfrog/getparents.h | 4 ++
libfrog/getparents.c | 93 ++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 82 insertions(+), 15 deletions(-)
diff --git a/libfrog/getparents.h b/libfrog/getparents.h
index 8098d594219b4c..e1df30889c7606 100644
--- a/libfrog/getparents.h
+++ b/libfrog/getparents.h
@@ -39,4 +39,8 @@ int fd_to_path(int fd, size_t ioctl_bufsize, char *path, size_t pathlen);
int handle_to_path(const void *hanp, size_t hlen, size_t ioctl_bufsize,
char *path, size_t pathlen);
+int handle_walk_paths_fd(const char *mntpt, int mntfd, const void *hanp,
+ size_t hanlen, size_t ioctl_bufsize, walk_path_fn fn,
+ void *arg);
+
#endif /* __LIBFROG_GETPARENTS_H_ */
diff --git a/libfrog/getparents.c b/libfrog/getparents.c
index 9118b0ff32db0d..e8f545392634e4 100644
--- a/libfrog/getparents.c
+++ b/libfrog/getparents.c
@@ -112,9 +112,13 @@ fd_walk_parents(
return ret;
}
-/* Walk all parent pointers of this handle. Returns 0 or positive errno. */
-int
-handle_walk_parents(
+/*
+ * Walk all parent pointers of this handle using the given fd to query the
+ * filesystem. Returns 0 or positive errno.
+ */
+static int
+handle_walk_parents_fd(
+ int fd,
const void *hanp,
size_t hlen,
size_t bufsize,
@@ -123,21 +127,11 @@ handle_walk_parents(
{
struct xfs_getparents_by_handle gph = { };
void *buf;
- char *mntpt;
- int fd;
int ret;
if (hlen != sizeof(struct xfs_handle))
return EINVAL;
- /*
- * This function doesn't modify the handle, but we don't want to have
- * to bump the libhandle major version just to change that.
- */
- fd = handle_to_fsfd((void *)hanp, &mntpt);
- if (fd < 0)
- return errno;
-
buf = alloc_records(&gph.gph_request, bufsize);
if (!buf)
return errno;
@@ -158,6 +152,29 @@ handle_walk_parents(
return ret;
}
+/*
+ * Walk all parent pointers of this handle, using handle_to_fsfd to look up the
+ * fd that is used to query the filesystem. Returns 0 or positive errno.
+ */
+int
+handle_walk_parents(
+ const void *hanp,
+ size_t hlen,
+ size_t bufsize,
+ walk_parent_fn fn,
+ void *arg)
+{
+ char *mntpt;
+ int fd;
+
+ /*
+ * This function doesn't modify the handle, but we don't want to have
+ * to bump the libhandle major version just to change that.
+ *
+ * mntpt is only passed as an out-parameter to handle_to_fsfd; its value
+ * is not used afterwards.
+ */
+ fd = handle_to_fsfd((void *)hanp, &mntpt);
+ if (fd < 0)
+ return errno;
+
+ return handle_walk_parents_fd(fd, hanp, hlen, bufsize, fn, arg);
+}
+
struct walk_ppaths_info {
/* Callback */
walk_path_fn fn;
@@ -169,7 +186,11 @@ struct walk_ppaths_info {
/* Path that we're constructing. */
struct path_list *path;
+ /* Use this much memory per call. */
size_t ioctl_bufsize;
+
+ /* Use this fd for calling the getparents ioctl. */
+ int mntfd;
};
/*
@@ -200,8 +221,14 @@ find_parent_component(
return errno;
path_list_add_parent_component(wpi->path, pc);
- ret = handle_walk_parents(&rec->p_handle, sizeof(rec->p_handle),
- wpi->ioctl_bufsize, find_parent_component, wpi);
+ if (wpi->mntfd >= 0)
+ ret = handle_walk_parents_fd(wpi->mntfd, &rec->p_handle,
+ sizeof(rec->p_handle), wpi->ioctl_bufsize,
+ find_parent_component, wpi);
+ else
+ ret = handle_walk_parents(&rec->p_handle,
+ sizeof(rec->p_handle), wpi->ioctl_bufsize,
+ find_parent_component, wpi);
path_list_del_component(wpi->path, pc);
path_component_free(pc);
@@ -222,6 +249,7 @@ handle_walk_paths(
{
struct walk_ppaths_info wpi = {
.ioctl_bufsize = ioctl_bufsize,
+ .mntfd = -1,
};
int ret;
@@ -246,6 +274,41 @@ handle_walk_paths(
return ret;
}
+/*
+ * Call the given function on all known paths from the vfs root to the inode
+ * described in the handle, using a caller-supplied mountpoint path and an
+ * already open fd instead of looking them up through libhandle. Returns 0
+ * for success or positive errno.
+ *
+ * mntpt and mntfd are stored in the walk state. mntfd is used for the first
+ * getparents query and, as long as it is not negative, for the recursive
+ * queries on each parent; a negative mntfd makes the recursive steps fall
+ * back to handle_walk_parents().
+ */
+int
+handle_walk_paths_fd(
+ const char *mntpt,
+ int mntfd,
+ const void *hanp,
+ size_t hlen,
+ size_t ioctl_bufsize,
+ walk_path_fn fn,
+ void *arg)
+{
+ struct walk_ppaths_info wpi = {
+ .ioctl_bufsize = ioctl_bufsize,
+ .mntfd = mntfd,
+ .mntpt = (char *)mntpt,
+ };
+ int ret;
+
+ /* The path list accumulates components while we walk up the tree. */
+ wpi.path = path_list_init();
+ if (!wpi.path)
+ return errno;
+ wpi.fn = fn;
+ wpi.arg = arg;
+
+ ret = handle_walk_parents_fd(mntfd, hanp, hlen, ioctl_bufsize,
+ find_parent_component, &wpi);
+
+ path_list_free(wpi.path);
+ return ret;
+}
+
/*
* Call the given function on all known paths from the vfs root to the inode
* referred to by the file description. Returns 0 or positive errno.
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 02/26] libfrog: create healthmon event log library functions
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
2026-03-19 4:39 ` [PATCH 01/26] libfrog: add a function to grab the path from an open fd and a file handle Darrick J. Wong
@ 2026-03-19 4:39 ` Darrick J. Wong
2026-03-19 4:39 ` [PATCH 03/26] libfrog: add support code for starting systemd services programmatically Darrick J. Wong
` (23 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:39 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add some helper functions to log health monitoring events so that xfs_io
and xfs_healer can share logging code.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
libfrog/flagmap.h | 20 +++
libfrog/healthevent.h | 43 ++++++
libfrog/Makefile | 4 +
libfrog/flagmap.c | 62 ++++++++
libfrog/healthevent.c | 360 +++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 489 insertions(+)
create mode 100644 libfrog/flagmap.h
create mode 100644 libfrog/healthevent.h
create mode 100644 libfrog/flagmap.c
create mode 100644 libfrog/healthevent.c
diff --git a/libfrog/flagmap.h b/libfrog/flagmap.h
new file mode 100644
index 00000000000000..8031d75a7c02a8
--- /dev/null
+++ b/libfrog/flagmap.h
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef LIBFROG_FLAGMAP_H_
+#define LIBFROG_FLAGMAP_H_
+
+/*
+ * One entry of a table that maps a flag bit (or a plain value) to a name.
+ * Tables are terminated by an entry whose string is NULL.
+ */
+struct flag_map {
+ unsigned long long flag;
+ const char *string; /* passed through _() when formatted */
+};
+
+void mask_to_string(const struct flag_map *map, unsigned long long mask,
+ const char *delimiter, char *buf, size_t bufsize);
+
+const char *value_to_string(const struct flag_map *map,
+ unsigned long long value);
+
+#endif /* LIBFROG_FLAGMAP_H_ */
diff --git a/libfrog/healthevent.h b/libfrog/healthevent.h
new file mode 100644
index 00000000000000..6de41bc797100c
--- /dev/null
+++ b/libfrog/healthevent.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef LIBFROG_HEALTHEVENT_H_
+#define LIBFROG_HEALTHEVENT_H_
+
+struct hme_prefix {
+ /*
+ * Format a complete file path into this buffer to prevent the logging
+ * code from printing the mountpoint and a file handle. Only works for
+ * file-related events.
+ */
+ char path[MAXPATHLEN];
+
+ /* Set this to the mountpoint */
+ const char *mountpoint;
+};
+
+static inline bool hme_prefix_has_path(const struct hme_prefix *pfx)
+{
+ return pfx->path[0] != 0;
+}
+
+static inline void hme_prefix_clear_path(struct hme_prefix *pfx)
+{
+ pfx->path[0] = 0;
+}
+
+static inline void
+hme_prefix_init(
+ struct hme_prefix *pfx,
+ const char *mountpoint)
+{
+ pfx->mountpoint = mountpoint;
+ hme_prefix_clear_path(pfx);
+}
+
+void hme_report_event(const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme);
+
+#endif /* LIBFROG_HEALTHEVENT_H_ */
diff --git a/libfrog/Makefile b/libfrog/Makefile
index 927bd8d0957fab..bccd9289e5dd79 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -19,11 +19,13 @@ bulkstat.c \
convert.c \
crc32.c \
file_exchange.c \
+flagmap.c \
fsgeom.c \
fsproperties.c \
fsprops.c \
getparents.c \
histogram.c \
+healthevent.c \
file_attr.c \
list_sort.c \
linux.c \
@@ -51,11 +53,13 @@ dahashselftest.h \
div64.h \
fakelibattr.h \
file_exchange.h \
+flagmap.h \
fsgeom.h \
fsproperties.h \
fsprops.h \
getparents.h \
handle_priv.h \
+healthevent.h \
histogram.h \
file_attr.h \
logging.h \
diff --git a/libfrog/flagmap.c b/libfrog/flagmap.c
new file mode 100644
index 00000000000000..631c4bbc8f1dc0
--- /dev/null
+++ b/libfrog/flagmap.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include "platform_defs.h"
+#include "libfrog/flagmap.h"
+
+/*
+ * Format a bitmask as a delimited list of the strings mapped to its set bits,
+ * followed by a hex number for any unmapped bits.  buf is always terminated
+ * (empty if no bits are set); output is truncated if buf is too small.
+ */
+void
+mask_to_string(
+	const struct flag_map	*map,
+	unsigned long long	mask,
+	const char		*delimiter,
+	char			*buf,
+	size_t			bufsize)
+{
+	const char		*tag = "";
+	unsigned long long	seen = 0;
+	int			w;
+
+	if (!bufsize)
+		return;
+	buf[0] = 0;	/* callers print buf even if nothing gets formatted */
+	for (; map->string; map++) {
+		seen |= map->flag;
+		if (!(mask & map->flag))
+			continue;
+		w = snprintf(buf, bufsize, "%s%s", tag, _(map->string));
+		if (w < 0 || (size_t)w >= bufsize)
+			return;	/* error or truncated; buf is still terminated */
+		buf += w;
+		bufsize -= w;
+		tag = delimiter;
+	}
+
+	/* Report any bits that the table doesn't know about. */
+	if (mask & ~seen)
+		snprintf(buf, bufsize, "%s0x%llx", tag, mask & ~seen);
+}
+
+/*
+ * Look up a value in a NULL-terminated value-to-string table.  Returns the
+ * translated string of the first match, or a placeholder if nothing matches.
+ */
+const char *
+value_to_string(
+	const struct flag_map	*map,
+	unsigned long long	value)
+{
+	const struct flag_map	*m = map;
+
+	while (m->string && m->flag != value)
+		m++;
+
+	return m->string ? _(m->string) : _("unknown value");
+}
diff --git a/libfrog/healthevent.c b/libfrog/healthevent.c
new file mode 100644
index 00000000000000..8520cb3218fb03
--- /dev/null
+++ b/libfrog/healthevent.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include "platform_defs.h"
+#include "libfrog/healthevent.h"
+#include "libfrog/flagmap.h"
+
+/*
+ * The healthmon log string format is as follows:
+ *
+ * WHICH OBJECT: STATUS
+ *
+ * /mnt: 32 events lost
+ * /mnt agno 0x5 bnobt, rmapbt: sick
+ * /mnt rgno 0x5 bitmap: sick
+ * /mnt ino 13 gen 0x3 datafork: sick
+ * /mnt/a datafork: sick
+ * /mnt ino 13 gen 0x3 pos 4096 len 4096: directio_write failed
+ * /mnt/a pos 4096 len 4096: directio_read failed
+ * /mnt datadev daddr 0x13 bbcount 0x5: media error
+ * /mnt: filesystem shut down due to shenanigans, badness
+ */
+
+static const struct flag_map device_domains[] = {
+ { XFS_HEALTH_MONITOR_DOMAIN_DATADEV, N_("datadev") },
+ { XFS_HEALTH_MONITOR_DOMAIN_RTDEV, N_("rtdev") },
+ { XFS_HEALTH_MONITOR_DOMAIN_LOGDEV, N_("logdev") },
+ {0, NULL},
+};
+
+static inline const char *
+device_domain_string(
+ uint32_t domain)
+{
+ return value_to_string(device_domains, domain);
+}
+
+static const struct flag_map fileio_types[] = {
+ { XFS_HEALTH_MONITOR_TYPE_BUFREAD, N_("buffered_read") },
+ { XFS_HEALTH_MONITOR_TYPE_BUFWRITE, N_("buffered_write") },
+ { XFS_HEALTH_MONITOR_TYPE_DIOREAD, N_("directio_read") },
+ { XFS_HEALTH_MONITOR_TYPE_DIOWRITE, N_("directio_write") },
+ { XFS_HEALTH_MONITOR_TYPE_DATALOST, N_("media") },
+ {0, NULL},
+};
+
+static inline const char *
+fileio_type_string(
+ uint32_t type)
+{
+ return value_to_string(fileio_types, type);
+}
+
+static const struct flag_map health_types[] = {
+ { XFS_HEALTH_MONITOR_TYPE_SICK, N_("sick") },
+ { XFS_HEALTH_MONITOR_TYPE_CORRUPT, N_("corrupt") },
+ { XFS_HEALTH_MONITOR_TYPE_HEALTHY, N_("healthy") },
+ {0, NULL},
+};
+
+static inline const char *
+health_type_string(
+ uint32_t type)
+{
+ return value_to_string(health_types, type);
+}
+
+/* Report that the kernel lost events. */
+static void
+report_lost(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ printf("%s: %llu %s\n", pfx->mountpoint,
+ (unsigned long long)hme->e.lost.count,
+ _("events lost"));
+ fflush(stdout);
+}
+
+/* Report that the monitor is running. */
+static void
+report_running(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ printf("%s: %s\n", pfx->mountpoint, _("monitoring started"));
+ fflush(stdout);
+}
+
+/* Report that the filesystem was unmounted. */
+static void
+report_unmounted(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ printf("%s: %s\n", pfx->mountpoint, _("filesystem unmounted"));
+ fflush(stdout);
+}
+
+static const struct flag_map shutdown_reasons[] = {
+ { XFS_HEALTH_SHUTDOWN_META_IO_ERROR, N_("metadata I/O error") },
+ { XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR, N_("log I/O error") },
+ { XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT, N_("forced unmount") },
+ { XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE, N_("in-memory state corruption") },
+ { XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK, N_("ondisk metadata corruption") },
+ { XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED, N_("device removed") },
+ {0, NULL},
+};
+
+/* Report an abortive shutdown of the filesystem. */
+static void
+report_shutdown(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ char buf[512];
+
+ mask_to_string(shutdown_reasons, hme->e.shutdown.reasons, ", ", buf,
+ sizeof(buf));
+
+ printf("%s: %s %s\n", pfx->mountpoint,
+ _("filesystem shut down due to"), buf);
+ fflush(stdout);
+}
+
+static const struct flag_map inode_structs[] = {
+ { XFS_BS_SICK_INODE, N_("core") },
+ { XFS_BS_SICK_BMBTD, N_("datafork") },
+ { XFS_BS_SICK_BMBTA, N_("attrfork") },
+ { XFS_BS_SICK_BMBTC, N_("cowfork") },
+ { XFS_BS_SICK_DIR, N_("directory") },
+ { XFS_BS_SICK_XATTR, N_("xattr") },
+ { XFS_BS_SICK_SYMLINK, N_("symlink") },
+ { XFS_BS_SICK_PARENT, N_("parent") },
+ { XFS_BS_SICK_DIRTREE, N_("dirtree") },
+ {0, NULL},
+};
+
+/* Report inode metadata corruption */
+static void
+report_inode(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ char buf[512];
+
+ mask_to_string(inode_structs, hme->e.inode.mask, ", ", buf,
+ sizeof(buf));
+
+ if (hme_prefix_has_path(pfx))
+ printf("%s %s: %s\n",
+ pfx->path,
+ buf,
+ health_type_string(hme->type));
+ else
+ printf("%s %s %llu %s 0x%x %s: %s\n",
+ pfx->mountpoint,
+ _("ino"),
+ (unsigned long long)hme->e.inode.ino,
+ _("gen"),
+ hme->e.inode.gen,
+ buf,
+ health_type_string(hme->type));
+ fflush(stdout);
+}
+
+static const struct flag_map ag_structs[] = {
+ { XFS_AG_GEOM_SICK_SB, N_("super") },
+ { XFS_AG_GEOM_SICK_AGF, N_("agf") },
+ { XFS_AG_GEOM_SICK_AGFL, N_("agfl") },
+ { XFS_AG_GEOM_SICK_AGI, N_("agi") },
+ { XFS_AG_GEOM_SICK_BNOBT, N_("bnobt") },
+ { XFS_AG_GEOM_SICK_CNTBT, N_("cntbt") },
+ { XFS_AG_GEOM_SICK_INOBT, N_("inobt") },
+ { XFS_AG_GEOM_SICK_FINOBT, N_("finobt") },
+ { XFS_AG_GEOM_SICK_RMAPBT, N_("rmapbt") },
+ { XFS_AG_GEOM_SICK_REFCNTBT, N_("refcountbt") },
+ { XFS_AG_GEOM_SICK_INODES, N_("inodes") },
+ {0, NULL},
+};
+
+/* Report AG metadata corruption */
+static void
+report_ag(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ char buf[512];
+
+ mask_to_string(ag_structs, hme->e.group.mask, ", ", buf,
+ sizeof(buf));
+
+ printf("%s %s 0x%x %s: %s\n",
+ pfx->mountpoint,
+ _("agno"),
+ hme->e.group.gno,
+ buf,
+ health_type_string(hme->type));
+ fflush(stdout);
+}
+
+static const struct flag_map rtgroup_structs[] = {
+ { XFS_RTGROUP_GEOM_SICK_SUPER, N_("super") },
+ { XFS_RTGROUP_GEOM_SICK_BITMAP, N_("bitmap") },
+ { XFS_RTGROUP_GEOM_SICK_SUMMARY, N_("summary") },
+ { XFS_RTGROUP_GEOM_SICK_RMAPBT, N_("rmapbt") },
+ { XFS_RTGROUP_GEOM_SICK_REFCNTBT, N_("refcountbt") },
+ {0, NULL},
+};
+
+/* Report rtgroup metadata corruption */
+static void
+report_rtgroup(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ char buf[512];
+
+ mask_to_string(rtgroup_structs, hme->e.group.mask, ", ", buf,
+ sizeof(buf));
+
+ printf("%s %s 0x%x %s: %s\n",
+ pfx->mountpoint,
+ _("rgno"),
+ hme->e.group.gno,
+ buf, health_type_string(hme->type));
+ fflush(stdout);
+}
+
+static const struct flag_map fs_structs[] = {
+ { XFS_FSOP_GEOM_SICK_COUNTERS, N_("fscounters") },
+ { XFS_FSOP_GEOM_SICK_UQUOTA, N_("usrquota") },
+ { XFS_FSOP_GEOM_SICK_GQUOTA, N_("grpquota") },
+ { XFS_FSOP_GEOM_SICK_PQUOTA, N_("prjquota") },
+ { XFS_FSOP_GEOM_SICK_RT_BITMAP, N_("bitmap") },
+ { XFS_FSOP_GEOM_SICK_RT_SUMMARY, N_("summary") },
+ { XFS_FSOP_GEOM_SICK_QUOTACHECK, N_("quotacheck") },
+ { XFS_FSOP_GEOM_SICK_NLINKS, N_("nlinks") },
+ { XFS_FSOP_GEOM_SICK_METADIR, N_("metadir") },
+ { XFS_FSOP_GEOM_SICK_METAPATH, N_("metapath") },
+ {0, NULL},
+};
+
+/* Report fs-wide metadata corruption */
+static void
+report_fs(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ char buf[512];
+
+ mask_to_string(fs_structs, hme->e.fs.mask, ", ", buf, sizeof(buf));
+
+ printf("%s %s: %s\n",
+ pfx->mountpoint,
+ buf,
+ health_type_string(hme->type));
+ fflush(stdout);
+}
+
+/* Report device media corruption */
+static void
+report_device_error(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ printf("%s %s %s 0x%llx %s 0x%llx: %s\n", pfx->mountpoint,
+ device_domain_string(hme->domain),
+ _("daddr"),
+ (unsigned long long)hme->e.media.daddr,
+ _("bbcount"),
+ (unsigned long long)hme->e.media.bbcount,
+ _("media error"));
+ fflush(stdout);
+}
+
+/* Report file range errors */
+static void
+report_file_range(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ if (hme_prefix_has_path(pfx))
+ printf("%s ", pfx->path);
+ else
+ printf("%s %s %llu %s 0x%x ",
+ pfx->mountpoint,
+ _("ino"),
+ (unsigned long long)hme->e.filerange.ino,
+ _("gen"),
+ hme->e.filerange.gen);
+ if (hme->type != XFS_HEALTH_MONITOR_TYPE_DATALOST &&
+ hme->e.filerange.error)
+ printf("%s %llu %s %llu: %s: %s\n",
+ _("pos"),
+ (unsigned long long)hme->e.filerange.pos,
+ _("len"),
+ (unsigned long long)hme->e.filerange.len,
+ fileio_type_string(hme->type),
+ strerror(hme->e.filerange.error));
+ else
+ printf("%s %llu %s %llu: %s %s\n",
+ _("pos"),
+ (unsigned long long)hme->e.filerange.pos,
+ _("len"),
+ (unsigned long long)hme->e.filerange.len,
+ fileio_type_string(hme->type),
+ _("failed"));
+ fflush(stdout);
+}
+
+/*
+ * Log a health monitoring event to stdout. The event domain selects the
+ * reporting function; for mount-domain events the event type selects it.
+ * Events whose domain (or mount-domain type) is not listed below are silently
+ * ignored.
+ */
+void
+hme_report_event(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme)
+{
+ switch (hme->domain) {
+ case XFS_HEALTH_MONITOR_DOMAIN_MOUNT:
+ switch (hme->type) {
+ case XFS_HEALTH_MONITOR_TYPE_LOST:
+ report_lost(pfx, hme);
+ return;
+ case XFS_HEALTH_MONITOR_TYPE_RUNNING:
+ report_running(pfx, hme);
+ return;
+ case XFS_HEALTH_MONITOR_TYPE_UNMOUNT:
+ report_unmounted(pfx, hme);
+ return;
+ case XFS_HEALTH_MONITOR_TYPE_SHUTDOWN:
+ report_shutdown(pfx, hme);
+ return;
+ }
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_INODE:
+ report_inode(pfx, hme);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_AG:
+ report_ag(pfx, hme);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_RTGROUP:
+ report_rtgroup(pfx, hme);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_FS:
+ report_fs(pfx, hme);
+ break;
+ /* All three device domains share the media error report. */
+ case XFS_HEALTH_MONITOR_DOMAIN_DATADEV:
+ case XFS_HEALTH_MONITOR_DOMAIN_RTDEV:
+ case XFS_HEALTH_MONITOR_DOMAIN_LOGDEV:
+ report_device_error(pfx, hme);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_FILERANGE:
+ report_file_range(pfx, hme);
+ break;
+ }
+}
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 03/26] libfrog: add support code for starting systemd services programmatically
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
2026-03-19 4:39 ` [PATCH 01/26] libfrog: add a function to grab the path from an open fd and a file handle Darrick J. Wong
2026-03-19 4:39 ` [PATCH 02/26] libfrog: create healthmon event log library functions Darrick J. Wong
@ 2026-03-19 4:39 ` Darrick J. Wong
2026-03-19 4:39 ` [PATCH 04/26] libfrog: hoist a couple of service helper functions Darrick J. Wong
` (22 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:39 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add some simple routines for computing the name of systemd service
instances and starting systemd services. These will be used by the
xfs_healer_start service to start per-filesystem xfs_healer service
instances.
Note that we run systemd helper programs as subprocesses for a couple of
reasons. First, the path-escaping functionality is not a part of any
library-accessible API, which means it can only be accessed via
systemd-escape(1). Second, although the service startup functionality
can be reached via dbus, doing so would introduce a new library
dependency. Systemd is also undergoing a dbus -> varlink RPC transition
so we avoid that mess by calling the cli systemctl(1) program.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
libfrog/systemd.h | 20 ++++++
configure.ac | 1
include/builddefs.in | 1
libfrog/Makefile | 6 ++
libfrog/systemd.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
m4/package_libcdev.m4 | 19 +++++
6 files changed, 224 insertions(+)
create mode 100644 libfrog/systemd.h
create mode 100644 libfrog/systemd.c
diff --git a/libfrog/systemd.h b/libfrog/systemd.h
new file mode 100644
index 00000000000000..4f414bc3c1e9c3
--- /dev/null
+++ b/libfrog/systemd.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 Oracle. All rights reserved.
+ * All Rights Reserved.
+ */
+#ifndef __LIBFROG_SYSTEMD_H__
+#define __LIBFROG_SYSTEMD_H__
+
+int systemd_path_instance_unit_name(const char *unit_template,
+ const char *path, char *unitname, size_t unitnamelen);
+
+enum systemd_unit_manage {
+ UM_STOP,
+ UM_START,
+ UM_RESTART,
+};
+
+int systemd_manage_unit(enum systemd_unit_manage how, const char *unitname);
+
+#endif /* __LIBFROG_SYSTEMD_H__ */
diff --git a/configure.ac b/configure.ac
index 8092b8656ef94b..a9febabc71cfc7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -182,6 +182,7 @@ AC_CONFIG_UDEV_RULE_DIR
AC_HAVE_BLKID_TOPO
AC_HAVE_TRIVIAL_AUTO_VAR_INIT
AC_STRERROR_R_RETURNS_STRING
+AC_HAVE_CLOSE_RANGE
if test "$enable_ubsan" = "yes" || test "$enable_ubsan" = "probe"; then
AC_PACKAGE_CHECK_UBSAN
diff --git a/include/builddefs.in b/include/builddefs.in
index b38a099b7d525a..4a2cb757c0bdb3 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -118,6 +118,7 @@ HAVE_UDEV = @have_udev@
UDEV_RULE_DIR = @udev_rule_dir@
HAVE_LIBURCU_ATOMIC64 = @have_liburcu_atomic64@
STRERROR_R_RETURNS_STRING = @strerror_r_returns_string@
+HAVE_CLOSE_RANGE = @have_close_range@
GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
# -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
diff --git a/libfrog/Makefile b/libfrog/Makefile
index bccd9289e5dd79..89a0332ae85372 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -36,6 +36,7 @@ ptvar.c \
radix-tree.c \
randbytes.c \
scrub.c \
+systemd.c \
util.c \
workqueue.c \
zones.c
@@ -70,6 +71,7 @@ radix-tree.h \
randbytes.h \
scrub.h \
statx.h \
+systemd.h \
workqueue.h \
zones.h
@@ -90,6 +92,10 @@ ifeq ($(HAVE_GETRANDOM_NONBLOCK),yes)
LCFLAGS += -DHAVE_GETRANDOM_NONBLOCK
endif
+ifeq ($(HAVE_CLOSE_RANGE),yes)
+CFLAGS += -DHAVE_CLOSE_RANGE
+endif
+
default: ltdepend $(LTLIBRARY) $(GETTEXT_PY)
crc32table.h: gen_crc32table.c crc32defs.h
diff --git a/libfrog/systemd.c b/libfrog/systemd.c
new file mode 100644
index 00000000000000..2d2d2e9be72e6a
--- /dev/null
+++ b/libfrog/systemd.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include "libfrog/systemd.h"
+
+/* Close all fds except for the three standard ones. */
+static void
+close_fds(void)
+{
+	long	max_fd = sysconf(_SC_OPEN_MAX);
+	int	fd;
+
+	if (max_fd < 1)
+		max_fd = 1024;
+
+#ifdef HAVE_CLOSE_RANGE
+	/* ~0U also catches fds above the current RLIMIT_NOFILE soft limit. */
+	if (close_range(STDERR_FILENO + 1, ~0U, 0) == 0)
+		return;
+#endif
+	for (fd = STDERR_FILENO + 1; fd < max_fd; fd++)
+		close(fd);
+}
+
+/*
+ * Compute the systemd instance unit name for a given path.
+ *
+ * The escaping logic is only available via systemd-escape(1); there's no
+ * library or dbus service that we can call.  Returns 0 or -1.
+ */
+int
+systemd_path_instance_unit_name(
+	const char		*unit_template,
+	const char		*path,
+	char			*unitname,
+	size_t			unitnamelen)
+{
+	ssize_t			bytes;
+	pid_t			child_pid;
+	int			pipe_fds[2];
+	int			child_status;
+	int			ret;
+
+	if (!unitnamelen || pipe(pipe_fds))
+		return -1;
+
+	child_pid = fork();
+	if (child_pid < 0) {
+		ret = errno;
+		close(pipe_fds[0]);
+		close(pipe_fds[1]);
+		errno = ret;
+		return -1;
+	}
+
+	if (!child_pid) {
+		/* child process; _exit so we never flush the parent's stdio */
+		char *argv[] = {
+			"systemd-escape",
+			"--template",
+			(char *)unit_template,
+			"--path",
+			(char *)path,
+			NULL,
+		};
+
+		ret = dup2(pipe_fds[1], STDOUT_FILENO);
+		if (ret < 0) {
+			perror(path);
+			goto fail;
+		}
+
+		close_fds();
+
+		ret = execvp("systemd-escape", argv);
+		if (ret)
+			perror(path);
+
+fail:
+		_exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Close our copy of the pipe's write end so that the read won't hang
+	 * if the child exits without writing anything to stdout.
+	 */
+	close(pipe_fds[1]);
+	bytes = read(pipe_fds[0], unitname, unitnamelen - 1);
+	close(pipe_fds[0]);
+
+	if (waitpid(child_pid, &child_status, 0) != child_pid ||
+	    !WIFEXITED(child_status) || WEXITSTATUS(child_status) != 0) {
+		errno = 0;
+		return -1;
+	}
+
+	if (bytes < 0)
+		return -1;
+
+	/* Terminate string at first newline or end of the data we read. */
+	unitname[bytes] = 0;
+	unitname[strcspn(unitname, "\n")] = 0;
+
+	return 0;
+}
+
+static const char *systemd_unit_manage_string(enum systemd_unit_manage how)
+{
+ switch (how) {
+ case UM_STOP:
+ return "stop";
+ case UM_START:
+ return "start";
+ case UM_RESTART:
+ return "restart";
+ }
+
+ /* shut up gcc */
+ return NULL;
+}
+
+/*
+ * Start/stop/restart a systemd unit and let it run in the background.
+ *
+ * systemctl start wraps a lot of logic around starting a unit, so it's less
+ * work for xfsprogs to invoke systemctl instead of calling through dbus.
+ * Returns 0 if systemctl exited with status 0, or -1 otherwise.
+ */
+int
+systemd_manage_unit(
+	enum systemd_unit_manage	how,
+	const char			*unitname)
+{
+	pid_t				child_pid;
+	int				child_status;
+	int				ret;
+
+	child_pid = fork();
+	if (child_pid < 0)
+		return -1;
+
+	if (!child_pid) {
+		/* child execs systemctl */
+		char *argv[] = {
+			"systemctl",
+			(char *)systemd_unit_manage_string(how),
+			"--no-block",
+			(char *)unitname,
+			NULL,
+		};
+
+		close_fds();
+
+		ret = execvp("systemctl", argv);
+		if (ret)
+			perror("systemctl");
+
+		/* exec failed; don't flush the parent's stdio buffers */
+		_exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Parent waits for the child; systemctl (stop/start/restart)
+	 * --no-block should return quickly.  Only trust child_status if
+	 * waitpid actually reaped the child.
+	 */
+	if (waitpid(child_pid, &child_status, 0) == child_pid &&
+	    WIFEXITED(child_status) && WEXITSTATUS(child_status) == 0)
+		return 0;
+
+	errno = ENOMEM;
+	return -1;
+}
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index c5538c30d2518a..b3d87229d3367a 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -347,3 +347,22 @@ puts(strerror_r(0, buf, sizeof(buf)));
CFLAGS="$OLD_CFLAGS"
AC_SUBST(strerror_r_returns_string)
])
+
+#
+# Check if close_range exists
+#
+AC_DEFUN([AC_HAVE_CLOSE_RANGE],
+ [AC_MSG_CHECKING([for close_range])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <linux/close_range.h>
+ ]], [[
+close_range(0, 0, 0);
+ ]])
+ ], have_close_range=yes
+ AC_MSG_RESULT(yes),
+ AC_MSG_RESULT(no))
+ AC_SUBST(have_close_range)
+ ])
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 04/26] libfrog: hoist a couple of service helper functions
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (2 preceding siblings ...)
2026-03-19 4:39 ` [PATCH 03/26] libfrog: add support code for starting systemd services programmatically Darrick J. Wong
@ 2026-03-19 4:39 ` Darrick J. Wong
2026-03-19 4:40 ` [PATCH 05/26] libfrog: add wrappers for listmount and statmount Darrick J. Wong
` (21 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:39 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Hoist a couple of service/daemon-related helper functions to libfrog so
that we can share the code between xfs_scrub and xfs_healer.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
libfrog/systemd.h | 28 ++++++++++++++++++++++++++++
scrub/xfs_scrub.c | 32 +++++++++-----------------------
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/libfrog/systemd.h b/libfrog/systemd.h
index 4f414bc3c1e9c3..c96df4afa39aa6 100644
--- a/libfrog/systemd.h
+++ b/libfrog/systemd.h
@@ -17,4 +17,32 @@ enum systemd_unit_manage {
int systemd_manage_unit(enum systemd_unit_manage how, const char *unitname);
+/* Are we running as a service? True if SERVICE_MODE is set in the environment. */
+static inline bool systemd_is_service(void)
+{
+ return getenv("SERVICE_MODE") != NULL;
+}
+
+/* Special processing for a service/daemon program that is exiting. */
+static inline int
+systemd_service_exit(int ret)
+{
+ /*
+ * We have to sleep 2 seconds here because journald uses the pid to
+ * connect our log messages to the systemd service. This is critical
+ * for capturing all the log messages if the service fails, because
+ * failure analysis tools use the service name to gather log messages
+ * for reporting.
+ */
+ sleep(2);
+
+ /*
+ * If we're being run as a service, the return code must fit the LSB
+ * init script action error guidelines, which is to say that we
+ * compress all errors to 1 ("generic or unspecified error", LSB 5.0
+ * section 22.2) and hope the admin will scan the log for what actually
+ * happened.
+ */
+ return ret != 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
#endif /* __LIBFROG_SYSTEMD_H__ */
diff --git a/scrub/xfs_scrub.c b/scrub/xfs_scrub.c
index 3dba972a7e8d2a..79937aa8cce4c4 100644
--- a/scrub/xfs_scrub.c
+++ b/scrub/xfs_scrub.c
@@ -19,6 +19,7 @@
#include "unicrash.h"
#include "progress.h"
#include "libfrog/histogram.h"
+#include "libfrog/systemd.h"
/*
* XFS Online Metadata Scrub (and Repair)
@@ -866,8 +867,7 @@ main(
if (stdout_isatty && !progress_fp)
progress_fp = fdopen(1, "w+");
- if (getenv("SERVICE_MODE"))
- is_service = true;
+ is_service = systemd_is_service();
/* Initialize overall phase stats. */
error = phase_start(&all_pi, 0, NULL);
@@ -960,29 +960,15 @@ main(
hist_free(&ctx.datadev_hist);
hist_free(&ctx.rtdev_hist);
- /*
- * If we're being run as a service, the return code must fit the LSB
- * init script action error guidelines, which is to say that we
- * compress all errors to 1 ("generic or unspecified error", LSB 5.0
- * section 22.2) and hope the admin will scan the log for what
- * actually happened.
- *
- * We have to sleep 2 seconds here because journald uses the pid to
- * connect our log messages to the systemd service. This is critical
- * for capturing all the log messages if the scrub fails, because the
- * fail service uses the service name to gather log messages for the
- * error report.
- *
- * Note: We don't count a lack of kernel support as a service failure
- * because we haven't determined that there's anything wrong with the
- * filesystem.
- */
if (is_service) {
- sleep(2);
+ /*
+ * Note: We don't count a lack of kernel support as a service
+ * failure because we haven't determined that there's anything
+ * wrong with the filesystem.
+ */
if (!ctx.scrub_setup_succeeded)
- return 0;
- if (ret != SCRUB_RET_SUCCESS)
- return 1;
+ ret = 0;
+ return systemd_service_exit(ret);
}
return ret;
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 05/26] libfrog: add wrappers for listmount and statmount
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (3 preceding siblings ...)
2026-03-19 4:39 ` [PATCH 04/26] libfrog: hoist a couple of service helper functions Darrick J. Wong
@ 2026-03-19 4:40 ` Darrick J. Wong
2026-03-19 4:40 ` [PATCH 06/26] man2: document the healthmon ioctl Darrick J. Wong
` (20 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:40 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add some wrappers for listmount and statmount so that we don't have to
open-code the kernel ABI quirks in every utility program that uses it.
Note that glibc seems to have discussed providing a wrapper in late 2023
but took no action; and the listmount manpage says that there is no
glibc wrapper.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
include/linux.h | 8 +++-
libfrog/statmount.h | 104 +++++++++++++++++++++++++++++++++++++++++++++++++
configure.ac | 5 ++
include/builddefs.in | 7 +++
libfrog/Makefile | 9 ++++
libfrog/statmount.c | 76 ++++++++++++++++++++++++++++++++++++
m4/package_libcdev.m4 | 86 +++++++++++++++++++++++++++++++++++++++++
7 files changed, 294 insertions(+), 1 deletion(-)
create mode 100644 libfrog/statmount.h
create mode 100644 libfrog/statmount.c
diff --git a/include/linux.h b/include/linux.h
index 3ea9016272e688..8972c9596c75f5 100644
--- a/include/linux.h
+++ b/include/linux.h
@@ -32,7 +32,13 @@
#ifdef OVERRIDE_SYSTEM_FSXATTR
# define fsxattr sys_fsxattr
#endif
-#include <linux/fs.h> /* fsxattr defintion for new kernels */
+#ifdef OVERRIDE_SYSTEM_STATMOUNT
+# define statmount sys_statmount
+#endif
+#include <linux/fs.h> /* fsxattr/statmount definition for new kernels */
+#ifdef OVERRIDE_SYSTEM_STATMOUNT
+# undef statmount
+#endif
#ifdef OVERRIDE_SYSTEM_FSXATTR
# undef fsxattr
#endif
diff --git a/libfrog/statmount.h b/libfrog/statmount.h
new file mode 100644
index 00000000000000..7e281ce93029ff
--- /dev/null
+++ b/libfrog/statmount.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 Oracle. All rights reserved.
+ * All Rights Reserved.
+ */
+#ifndef __LIBFROG_STATMOUNT_H__
+#define __LIBFROG_STATMOUNT_H__
+
+/* This is the path to the current process' mount namespace file */
+#define DEFAULT_MOUNTNS_FILE "/proc/self/ns/mnt"
+
+/*
+ * Believe it or not, listmount and statmount treat a zero value for mnt_ns_fd
+ * as if that means "use the current process' mount namespace" even though
+ * Linus Torvalds roared about that with the BPF people.
+ */
+#define DEFAULT_MOUNTNS_FD (0)
+
+#ifdef OVERRIDE_SYSTEM_STATMOUNT
+struct statmount {
+ __u32 size; /* Total size, including strings */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
+ __u64 mask; /* What results were written */
+ __u32 sb_dev_major; /* Device ID */
+ __u32 sb_dev_minor;
+ __u64 sb_magic; /* ..._SUPER_MAGIC */
+ __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+ __u32 fs_type; /* [str] Filesystem type */
+ __u64 mnt_id; /* Unique ID of mount */
+ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
+ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
+ __u32 mnt_parent_id_old;
+ __u64 mnt_attr; /* MOUNT_ATTR_... */
+ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+ __u64 mnt_peer_group; /* ID of shared peer group */
+ __u64 mnt_master; /* Mount receives propagation from this ID */
+ __u64 propagate_from; /* Propagation from in current namespace */
+ __u32 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u32 mnt_point; /* [str] Mountpoint relative to current root */
+ __u64 mnt_ns_id; /* ID of the mount namespace */
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u64 supported_mask; /* Mask flags that this kernel supports */
+ __u64 __spare2[45];
+ char str[]; /* Variable size part containing strings */
+};
+#endif
+
+/* all the new flags added since the beginning of statmount */
+
+#ifndef STATMOUNT_MNT_NS_ID
+#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
+#endif
+
+#ifndef STATMOUNT_MNT_OPTS
+#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#endif
+
+#ifndef STATMOUNT_FS_SUBTYPE
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#endif
+
+#ifndef STATMOUNT_SB_SOURCE
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#endif
+
+#ifndef STATMOUNT_OPT_ARRAY
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#endif
+
+#ifndef STATMOUNT_OPT_SEC_ARRAY
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
+#endif
+
+#ifndef STATMOUNT_SUPPORTED_MASK
+#define STATMOUNT_SUPPORTED_MASK 0x00001000U /* Want/got supported mask flags */
+#endif
+
+/* flag bits for statmount */
+#ifndef STATMOUNT_BY_FD
+#define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */
+#endif
+
+#define LISTMOUNT_INIT_CURSOR (0ULL)
+
+int libfrog_listmount(uint64_t mnt_id, int mnt_ns_fd, uint64_t *cursor,
+ uint64_t *mnt_ids, size_t nr_mnt_ids);
+
+int libfrog_statmount(uint64_t mnt_id, int mnt_ns_fd, uint64_t statmount_flags,
+ struct statmount *smbuf, size_t smbuf_size);
+
+int libfrog_fstatmount(int fd, uint64_t statmount_flags,
+ struct statmount *smbuf, size_t smbuf_size);
+
+static inline size_t libfrog_statmount_sizeof(size_t strings_bytes)
+{
+ return sizeof(struct statmount) + strings_bytes;
+}
+
+#endif /* __LIBFROG_STATMOUNT_H__ */
diff --git a/configure.ac b/configure.ac
index a9febabc71cfc7..cffcaf373cfa5e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -183,6 +183,11 @@ AC_HAVE_BLKID_TOPO
AC_HAVE_TRIVIAL_AUTO_VAR_INIT
AC_STRERROR_R_RETURNS_STRING
AC_HAVE_CLOSE_RANGE
+AC_HAVE_LISTMOUNT
+if test "$have_listmount" = "yes"; then
+ AC_HAVE_LISTMOUNT_NS_FD
+ AC_HAVE_STATMOUNT_SUPPORTED_MASK
+fi
if test "$enable_ubsan" = "yes" || test "$enable_ubsan" = "probe"; then
AC_PACKAGE_CHECK_UBSAN
diff --git a/include/builddefs.in b/include/builddefs.in
index 4a2cb757c0bdb3..d2d25c8a0ed676 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -119,6 +119,10 @@ UDEV_RULE_DIR = @udev_rule_dir@
HAVE_LIBURCU_ATOMIC64 = @have_liburcu_atomic64@
STRERROR_R_RETURNS_STRING = @strerror_r_returns_string@
HAVE_CLOSE_RANGE = @have_close_range@
+HAVE_LISTMOUNT = @have_listmount@
+HAVE_LISTMOUNT_NS_FD = @have_listmount_ns_fd@
+HAVE_STATMOUNT_SUPPORTED_MASK = @have_statmount_supported_mask@
+NEED_INTERNAL_STATMOUNT = @need_internal_statmount@
GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
# -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
@@ -141,6 +145,9 @@ endif
ifeq ($(NEED_INTERNAL_STATX),yes)
PCFLAGS+= -DOVERRIDE_SYSTEM_STATX
endif
+ifeq ($(NEED_INTERNAL_STATMOUNT),yes)
+PCFLAGS+= -DOVERRIDE_SYSTEM_STATMOUNT
+endif
ifeq ($(HAVE_GETFSMAP),yes)
PCFLAGS+= -DHAVE_GETFSMAP
endif
diff --git a/libfrog/Makefile b/libfrog/Makefile
index 89a0332ae85372..22668212f22b93 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -96,6 +96,15 @@ ifeq ($(HAVE_CLOSE_RANGE),yes)
CFLAGS += -DHAVE_CLOSE_RANGE
endif
+ifeq ($(HAVE_LISTMOUNT),yes)
+CFILES += statmount.c
+HFILES += statmount.h
+endif
+
+ifeq ($(HAVE_LISTMOUNT_NS_FD),yes)
+CFLAGS+=-DHAVE_LISTMOUNT_NS_FD
+endif
+
default: ltdepend $(LTLIBRARY) $(GETTEXT_PY)
crc32table.h: gen_crc32table.c crc32defs.h
diff --git a/libfrog/statmount.c b/libfrog/statmount.c
new file mode 100644
index 00000000000000..edf17d6080ea42
--- /dev/null
+++ b/libfrog/statmount.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include <libfrog/statmount.h>
+
+int
+libfrog_listmount(
+ uint64_t mnt_id,
+ int mnt_ns_fd,
+ uint64_t *cursor,
+ uint64_t *mnt_ids,
+ size_t nr_mnt_ids)
+{
+ struct mnt_id_req req = {
+ .size = sizeof(req),
+ .mnt_id = mnt_id,
+#ifdef HAVE_LISTMOUNT_NS_FD
+ .mnt_ns_fd = mnt_ns_fd,
+#else
+ .spare = mnt_ns_fd,
+#endif
+ .param = *cursor,
+ };
+ int ret = syscall(SYS_listmount, &req, mnt_ids, nr_mnt_ids, 0);
+
+ if (ret > 0)
+ *cursor = mnt_ids[ret - 1];
+
+ return ret;
+}
+
+int
+libfrog_statmount(
+ uint64_t mnt_id,
+ int mnt_ns_fd,
+ uint64_t statmount_flags,
+ struct statmount *smbuf,
+ size_t smbuf_size)
+{
+ struct mnt_id_req req = {
+ .size = sizeof(req),
+ .mnt_id = mnt_id,
+#ifdef HAVE_LISTMOUNT_NS_FD
+ .mnt_ns_fd = mnt_ns_fd,
+#else
+ .spare = mnt_ns_fd,
+#endif
+ .param = statmount_flags,
+ };
+
+ return syscall(SYS_statmount, &req, smbuf, smbuf_size, 0);
+}
+
+int
+libfrog_fstatmount(
+ int fd,
+ uint64_t statmount_flags,
+ struct statmount *smbuf,
+ size_t smbuf_size)
+{
+ struct mnt_id_req req = {
+ .size = sizeof(req),
+#ifdef HAVE_LISTMOUNT_NS_FD
+ .mnt_ns_fd = fd,
+#else
+ .spare = fd,
+#endif
+ .param = statmount_flags,
+ };
+
+ return syscall(SYS_statmount, &req, smbuf, smbuf_size, STATMOUNT_BY_FD);
+}
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index b3d87229d3367a..ec4a3ef444b705 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -366,3 +366,89 @@ close_range(0, 0, 0);
AC_MSG_RESULT(no))
AC_SUBST(have_close_range)
])
+
+#
+# Check if listmount and statmount exist. Note that statmount came first (6.8)
+# and listmount came later (6.9), so we'll refuse both if either is missing.
+#
+AC_DEFUN([AC_HAVE_LISTMOUNT],
+ [AC_MSG_CHECKING([for listmount])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+#include <alloca.h>
+ ]], [[
+ struct mnt_id_req req = {
+ .size = sizeof(req),
+ };
+ struct statmount smbuf;
+
+ syscall(SYS_statmount, &req, &smbuf, 0, 0);
+ syscall(SYS_listmount, &req, NULL, 0, 0);
+ ]])
+ ], have_listmount=yes
+ AC_MSG_RESULT(yes),
+ AC_MSG_RESULT(no))
+ AC_SUBST(have_listmount)
+ ])
+
+#
+# Check if mnt_id_req::mnt_ns_fd exists. This replaced mnt_id_req::spare in
+# 6.18, though earlier kernels allowed userspace to assign to spare.
+#
+AC_DEFUN([AC_HAVE_LISTMOUNT_NS_FD],
+ [AC_MSG_CHECKING([for struct mnt_id_req::mnt_ns_fd])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+#include <alloca.h>
+ ]], [[
+ struct mnt_id_req req = {
+ .mnt_ns_fd = 555,
+ };
+
+ syscall(SYS_listmount, &req, NULL, 0, 0);
+ ]])
+ ], have_listmount_ns_fd=yes
+ AC_MSG_RESULT(yes),
+ AC_MSG_RESULT(no))
+ AC_SUBST(have_listmount_ns_fd)
+ ])
+
+#
+# Check if statmount::supported_mask (and hence sb_source) exists. We need
+# sb_source for xfs_healer_start; and supported_mask for the xfs_io wrapper.
+# sb_source was added in 6.13 and supported_mask in 6.15.
+#
+AC_DEFUN([AC_HAVE_STATMOUNT_SUPPORTED_MASK],
+ [AC_MSG_CHECKING([for struct statmount::supported_mask])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+#include <alloca.h>
+ ]], [[
+ struct mnt_id_req req = {
+ .mnt_ns_fd = 555,
+ };
+ struct statmount smbuf = {
+ .supported_mask = 1,
+ };
+
+ syscall(SYS_statmount, &req, &smbuf, 0, 0);
+ ]])
+ ], have_statmount_supported_mask=yes
+ AC_MSG_RESULT(yes),
+ need_internal_statmount=yes
+ AC_MSG_RESULT(no))
+ AC_SUBST(have_statmount_supported_mask)
+ AC_SUBST(need_internal_statmount)
+ ])
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 06/26] man2: document the healthmon ioctl
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (4 preceding siblings ...)
2026-03-19 4:40 ` [PATCH 05/26] libfrog: add wrappers for listmount and statmount Darrick J. Wong
@ 2026-03-19 4:40 ` Darrick J. Wong
2026-03-19 4:40 ` [PATCH 07/26] man2: document the media verification ioctl Darrick J. Wong
` (19 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:40 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Document the XFS_IOC_HEALTH_MONITOR and
XFS_IOC_HEALTH_FD_ON_MONITORED_FS ioctls.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
man/man2/ioctl_xfs_health_fd_on_monitored_fs.2 | 75 ++++
man/man2/ioctl_xfs_health_monitor.2 | 464 ++++++++++++++++++++++++
2 files changed, 539 insertions(+)
create mode 100644 man/man2/ioctl_xfs_health_fd_on_monitored_fs.2
create mode 100644 man/man2/ioctl_xfs_health_monitor.2
diff --git a/man/man2/ioctl_xfs_health_fd_on_monitored_fs.2 b/man/man2/ioctl_xfs_health_fd_on_monitored_fs.2
new file mode 100644
index 00000000000000..bbc5ce9bbabf53
--- /dev/null
+++ b/man/man2/ioctl_xfs_health_fd_on_monitored_fs.2
@@ -0,0 +1,75 @@
+.\" Copyright (c) 2025-2026, Oracle. All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" SPDX-License-Identifier: GPL-2.0+
+.\" %%%LICENSE_END
+.TH IOCTL-XFS-HEALTH-FD-ON-MONITORED-FS 2 2026-01-04 "XFS"
+.SH NAME
+ioctl_xfs_health_fd_on_monitored_fs \- check if the given fd belongs to the same fs being monitored
+.SH SYNOPSIS
+.br
+.B #include <xfs/xfs_fs.h>
+.PP
+.BI "int ioctl(int " healthmon_fd ", XFS_IOC_HEALTH_FD_ON_MONITORED_FS, struct xfs_health_file_on_monitored_fs *" arg );
+.SH DESCRIPTION
+This XFS healthmon fd ioctl asks the kernel driver if the file descriptor
+passed in via
+.I arg
+points to a file on the same filesystem that is being monitored by
+.IR healthmon_fd .
+The file descriptor is conveyed in a structure of the following form:
+.PP
+.in +4n
+.nf
+struct xfs_health_file_on_monitored_fs {
+ __s32 fd;
+ __u32 flags;
+};
+.fi
+.in
+.PP
+The field
+.I flags
+must be zero.
+.PP
+The field
+.I fd
+is a descriptor of an open file.
+.PP
+The argument
+.I healthmon_fd
+must be a file opened via the
+.B XFS_IOC_HEALTH_MONITOR
+ioctl.
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+If the file descriptor points to a file on the same filesystem that is being
+monitored, 0 is returned.
+.PP
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B ESTALE
+The open file is not on the same filesystem that is being monitored.
+.TP
+.B EINVAL
+One or more of the arguments specified is invalid.
+.TP
+.B EBADF
+.I arg.fd
+does not refer to an open file.
+.TP
+.B EFAULT
+The
+.I arg
+structure could not be copied into the kernel.
+.TP
+.B ENOTTY
+.I healthmon_fd
+is not an XFS health monitoring file.
+.SH CONFORMING TO
+This API is specific to XFS filesystem on the Linux kernel.
+.SH SEE ALSO
+.BR ioctl_xfs_health_monitor (2)
diff --git a/man/man2/ioctl_xfs_health_monitor.2 b/man/man2/ioctl_xfs_health_monitor.2
new file mode 100644
index 00000000000000..269c434515d960
--- /dev/null
+++ b/man/man2/ioctl_xfs_health_monitor.2
@@ -0,0 +1,464 @@
+.\" Copyright (c) 2025-2026, Oracle. All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" SPDX-License-Identifier: GPL-2.0+
+.\" %%%LICENSE_END
+.TH IOCTL-XFS-HEALTH-MONITOR 2 2026-01-04 "XFS"
+.SH NAME
+ioctl_xfs_health_monitor \- read filesystem health events from the kernel
+.SH SYNOPSIS
+.br
+.B #include <xfs/xfs_fs.h>
+.PP
+.BI "int ioctl(int " dest_fd ", XFS_IOC_HEALTH_MONITOR, struct xfs_health_monitor *" arg );
+.SH DESCRIPTION
+This XFS ioctl asks the kernel driver to create a pseudo-file from which
+information about adverse filesystem health events can be read.
+This new file will be installed into the file descriptor table of the calling
+process as a read-only file, and will have the close-on-exec flag set.
+.PP
+The specific behaviors of this health monitor file are requested via a
+structure of the following form:
+.PP
+.in +4n
+.nf
+struct xfs_health_monitor {
+ __u64 flags;
+ __u8 format;
+ __u8 pad[23];
+};
+.fi
+.in
+.PP
+The field
+.I pad
+must be zero.
+.PP
+The field
+.I format
+controls the format of the event data that can be read:
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_FMT_V0
+Event data will be presented in discrete objects of type struct
+xfs_health_monitor_event.
+See below for more information.
+.RE
+
+.PD 1
+.PP
+The field
+.I flags
+controls the behavior of the monitor.
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_VERBOSE
+Return all health events, including affirmations of healthy metadata.
+.RE
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+Otherwise, the return value is a new file descriptor.
+.PP
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B EEXIST
+Health monitoring is already active for this filesystem.
+.TP
+.B EPERM
+The caller does not have permission to open a health monitor.
+Calling programs must have administrative capability, run in the initial user
+namespace, and the
+.I dest_fd
+passed to ioctl must be the root directory of an XFS filesystem.
+.TP
+.B EINVAL
+One or more of the arguments specified is invalid.
+.TP
+.B EFAULT
+The argument could not be copied into the kernel.
+.TP
+.B ENOMEM
+There was not sufficient memory to construct the health monitor.
+.SH EVENT FORMAT
+Calling programs retrieve XFS health events by calling
+.BR read (2)
+on the returned file descriptor.
+The read buffer must be large enough to hold at least one event object.
+Partial objects will not be returned; instead, a short read will occur.
+
+Events will be returned in the following format:
+
+.PP
+.in +4n
+.nf
+struct xfs_health_monitor_event {
+ __u32 domain;
+ __u32 type;
+ __u64 time_ns;
+
+ union {
+ struct xfs_health_monitor_lost lost;
+ struct xfs_health_monitor_fs fs;
+ struct xfs_health_monitor_group group;
+ struct xfs_health_monitor_inode inode;
+ struct xfs_health_monitor_shutdown shutdown;
+ struct xfs_health_monitor_media media;
+ struct xfs_health_monitor_filerange filerange;
+ } e;
+
+ __u64 pad[2];
+};
+.fi
+.in
+.PP
+The field
+.I time_ns
+records the timestamp at which the health event was generated, in units of
+nanoseconds since the Unix epoch.
+.PP
+The field
+.I pad
+will be zero.
+.PP
+The field
+.I domain
+indicates the scope of the filesystem affected by the event:
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_MOUNT
+The entire filesystem is affected.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_FS
+Metadata concerning the entire filesystem is affected.
+Details are available through the
+.I fs
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_AG
+Metadata concerning a specific allocation group is affected.
+Details are available through the
+.I group
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_RTGROUP
+Metadata concerning a specific realtime allocation group is affected.
+Details are available through the
+.I group
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_INODE
+File metadata is affected.
+Details are available through the
+.I inode
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_DATADEV
+The main data volume is affected.
+Details are available through the
+.I media
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_RTDEV
+The realtime volume is affected.
+Details are available through the
+.I media
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_LOGDEV
+The external log is affected.
+Details are available through the
+.I media
+field.
+.TP
+.B XFS_HEALTH_MONITOR_DOMAIN_FILERANGE
+File data is affected.
+Details are available through the
+.I filerange
+field.
+.RE
+
+.PP
+The field
+.I type
+indicates what was affected by a health event:
+.RS 0.4i
+.PP
+The following types apply to events from the
+.B MOUNT
+domain.
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_RUNNING
+This filesystem health monitor is now running.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_LOST
+Health events were lost.
+Details are available through the
+.I lost
+field.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_UNMOUNT
+The filesystem is being unmounted.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_SHUTDOWN
+The filesystem has shut down due to problems.
+Details are available through the
+.I shutdown
+field.
+.RE
+.PP
+The following three types apply to events from the
+.BR FS ,
+.BR AG ,
+.BR RTGROUP ,
+and
+.B INODE
+domains.
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_SICK
+Filesystem metadata has been scanned by online fsck and found to be corrupt.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_CORRUPT
+A metadata corruption problem was encountered during a filesystem operation
+outside of fsck.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_HEALTHY
+Filesystem metadata has either been scanned by online fsck and found to be
+in good condition, or it has been repaired to good condition.
+.RE
+.PP
+The following type applies to events from the
+.BR DATADEV ,
+.BR RTDEV ,
+and
+.B LOGDEV
+domains.
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR
+A media error has been observed on one of the storage devices that can be
+attached to an XFS filesystem.
+.RE
+.PP
+The following types apply to events from the
+.B FILERANGE
+domain.
+.RS 0.4i
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_BUFREAD
+An attempt to read (or readahead) from a file failed with an I/O error.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_BUFWRITE
+An attempt to write dirty data to storage failed with an I/O error.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_DIOREAD
+A direct read of file data from storage failed with an I/O error.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_DIOWRITE
+A direct write of file data to storage failed with an I/O error.
+.TP
+.B XFS_HEALTH_MONITOR_TYPE_DATALOST
+A latent media error was discovered on the storage backing part of this file.
+.RE
+.RE
+
+.PP
+The union
+.I e
+contains further details about the health event:
+
+.RS 0.4i
+.PP
+The kernel will use no more than 32KiB of memory per monitoring file to queue
+health events.
+If this limit is exceeded, an event will be generated to describe how many
+events were lost:
+
+.in +4n
+.nf
+struct xfs_health_monitor_lost {
+ __u64 count;
+};
+.fi
+.in
+.PP
+The
+.I count
+field records the number of events lost.
+
+.PP
+If whole-filesystem metadata experiences a health event, the exact type of
+that metadata is recorded as follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_fs {
+ __u32 mask;
+};
+.fi
+.in
+.PP
+The
+.I mask
+field will contain
+.I XFS_FSOP_GEOM_SICK_*
+flags that are documented in the
+.BR ioctl_xfs_fsgeometry (2)
+manual page.
+
+.PP
+If an allocation group (realtime or data) experiences a health event,
+the exact type and location of the metadata is recorded as follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_group {
+ __u32 mask;
+ __u32 gno;
+};
+.fi
+.in
+.PP
+The
+.I mask
+field will contain
+.I XFS_AG_SICK_*
+flags that are documented in the
+.BR ioctl_xfs_ag_geometry (2)
+manual page, or the
+.I XFS_RTGROUP_SICK_*
+flags that are documented by the
+.BR ioctl_xfs_rtgroup_geometry (2)
+manual page.
+.PP
+The
+.I gno
+field will contain the group number.
+
+.PP
+If a file experiences a health event, the exact type and handle to the file
+is recorded as follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_inode {
+ __u32 mask;
+ __u32 gen;
+ __u64 ino;
+};
+.fi
+.in
+.PP
+The
+.I mask
+field will contain
+.I XFS_BS_SICK_*
+flags that are documented by the
+.BR ioctl_xfs_bulkstat (2)
+manual page.
+.PP
+The
+.I ino
+and
+.I gen
+fields describe a handle to the affected file.
+
+.PP
+If the filesystem shuts down abnormally, the exact reasons are recorded as
+follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_shutdown {
+ __u32 reasons;
+};
+.fi
+.in
+.PP
+The
+.I reasons
+field is a combination of the following values:
+.RS 0.4i
+.TP
+.B XFS_HEALTH_SHUTDOWN_META_IO_ERROR
+Metadata I/O errors were encountered.
+.TP
+.B XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR
+Log I/O errors were encountered.
+.TP
+.B XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT
+The filesystem was forcibly shut down by an administrator.
+.TP
+.B XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE
+In-memory metadata are corrupt.
+.TP
+.B XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK
+On-disk metadata are corrupt.
+.TP
+.B XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED
+Storage devices were removed.
+.RE
+
+.PP
+If a media error is discovered on the storage device, the exact location is
+recorded as follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_media {
+ __u64 daddr;
+ __u64 bbcount;
+};
+.fi
+.in
+.PP
+The
+.I daddr
+and
+.I bbcount
+fields describe the range of the storage that was lost.
+Both are provided in units of 512-byte blocks.
+
+.PP
+If a problem is discovered with regular file data, the handle of the file
+and the exact range of the file are recorded as follows:
+
+.in +4n
+.nf
+struct xfs_health_monitor_filerange {
+ __u64 pos;
+ __u64 len;
+ __u64 ino;
+ __u32 gen;
+ __u32 error;
+};
+.fi
+.in
+.PP
+The
+.I ino
+and
+.I gen
+fields describe a handle to the affected file.
+The
+.I pos
+and
+.I len
+fields describe the range of the file data that are affected.
+Both are provided in units of bytes.
+.PP
+The
+.I error
+field describes the error that occurred.
+See the
+.BR errno (3)
+manual page for more information.
+.RE
+.SH CONFORMING TO
+This API is specific to XFS filesystem on the Linux kernel.
+.SH SEE ALSO
+.BR ioctl_xfs_health_fd_on_monitored_fs (2)
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 07/26] man2: document the media verification ioctl
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (5 preceding siblings ...)
2026-03-19 4:40 ` [PATCH 06/26] man2: document the healthmon ioctl Darrick J. Wong
@ 2026-03-19 4:40 ` Darrick J. Wong
2026-03-19 4:40 ` [PATCH 08/26] xfs_io: monitor filesystem health events Darrick J. Wong
` (18 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:40 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Document XFS_IOC_VERIFY_MEDIA, which is a new ioctl for xfs_scrub to
perform media scans on the disks underneath the filesystem. This will
enable media errors to be reported to xfs_healer and fsnotify.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
man/man2/ioctl_xfs_verify_media.2 | 185 +++++++++++++++++++++++++++++++++++++
1 file changed, 185 insertions(+)
create mode 100644 man/man2/ioctl_xfs_verify_media.2
diff --git a/man/man2/ioctl_xfs_verify_media.2 b/man/man2/ioctl_xfs_verify_media.2
new file mode 100644
index 00000000000000..bd0d4579f5a364
--- /dev/null
+++ b/man/man2/ioctl_xfs_verify_media.2
@@ -0,0 +1,185 @@
+.\" Copyright (c) 2025-2026, Oracle. All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" SPDX-License-Identifier: GPL-2.0+
+.\" %%%LICENSE_END
+.TH IOCTL-XFS-VERIFY-MEDIA 2 2026-01-09 "XFS"
+.SH NAME
+ioctl_xfs_verify_media \- verify the media of the devices backing XFS
+.SH SYNOPSIS
+.br
+.B #include <xfs/xfs_fs.h>
+.PP
+.BI "int ioctl(int " fd ", XFS_IOC_VERIFY_MEDIA, struct xfs_verify_media *" arg );
+.SH DESCRIPTION
+Verify the media of a storage device backing an XFS filesystem.
+If errors are found, report the error to the kernel so that it can generate
+health events for the health monitoring system and fsnotify.
+The verification request is conveyed in a structure of the following form:
+.PP
+.in +4n
+.nf
+struct xfs_verify_media {
+ __u32 me_dev;
+ __u32 me_flags;
+ __u64 me_start_daddr;
+ __u64 me_end_daddr;
+ __u32 me_ioerror;
+ __u32 me_pad;
+};
+.fi
+.in
+.PP
+The field
+.I me_pad
+must be zero.
+.PP
+The field
+.I me_ioerror
+will be set if the ioctl returns success.
+.PP
+The fields
+.I me_start_daddr
+and
+.I me_end_daddr
+are the range of the storage device to verify.
+Both values must be in units of 512-byte blocks.
+The
+.I me_start_daddr
+field is inclusive, and the
+.I me_end_daddr
+field is exclusive.
+If
+.I me_end_daddr
+is larger than the size of the device, the kernel will set it to the size of
+the device.
+
+If the system call returns success and any part of the storage device range was
+successfully verified, the
+.I me_start_daddr
+field will be updated to reflect the successful verification.
+If after this update the
+.I me_start_daddr
+is equal to
+.IR me_end_daddr ,
+then the entire range was verified successfully.
+
+If not, then a media error was encountered and the caller should generate a
+series of secondary calls to this ioctl with smaller ranges to discover the
+exact location and type of media error.
+The type of media error will be written to the
+.I me_ioerror
+field.
+
+.PP
+The field
+.I me_dev
+must be one of the following values:
+.RS 0.4i
+.TP
+.B XFS_DEV_DATA
+Verify the data device.
+.TP
+.B XFS_DEV_LOG
+Verify the external log device.
+.TP
+.B XFS_DEV_RT
+Verify the realtime device.
+.RE
+.PP
+The field
+.I me_flags
+is a bitmask of zero or more of the following values:
+.RS 0.4i
+.TP
+.B XFS_VERIFY_MEDIA_REPORT
+Report all media errors to fsnotify.
+.RE
+
+The
+.IR me_max_io_size
+field, if nonzero, will be used as advice for the maximum size of the IO to
+send to the device.
+
+The
+.I me_rest_us
+field will cause the kernel to pause for this many microseconds between IO
+requests.
+
+.SH RETURN VALUE
+On runtime error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+If 0 is returned, then
+.I me_start_daddr
+or
+.I me_ioerror
+will be updated.
+.PP
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B EPERM
+The calling process does not have sufficient privilege.
+.TP
+.B EINVAL
+One or more of the arguments specified is invalid.
+.TP
+.B EFAULT
+The
+.I arg
+structure could not be copied into the kernel.
+.TP
+.B ENODEV
+The device is not present.
+.TP
+.B ENOMEM
+There was not enough memory to perform the verification.
+
+.SH I/O ERRORS
+The
+.I me_ioerror
+field could be set to one of the following:
+.TP
+.B 0
+The verification I/O succeeded.
+.TP
+.B EOPNOTSUPP
+.TP
+.B ETIMEDOUT
+The kernel timed out the verification I/O command.
+.TP
+.B ENOLINK
+The transportation link to the storage device was down temporarily.
+.TP
+.B EREMOTEIO
+The storage target controller suffered a critical error.
+.TP
+.B ENODATA
+The storage target media suffered a critical error.
+.TP
+.B EILSEQ
+Storage protection metadata did not validate successfully.
+.TP
+.B ENOMEM
+There was not enough memory to allocate an I/O request.
+.TP
+.B ENODEV
+The storage device is offline.
+.TP
+.B ETIME
+The storage device timed out the I/O command.
+.TP
+.B EINVAL
+The I/O request was rejected by the device for being invalid.
+.TP
+.B EIO
+An I/O error occurred but no specific details are available.
+.RE
+.PP
+This list is not exhaustive and may grow in the future.
+
+.SH CONFORMING TO
+This API is specific to XFS filesystem on the Linux kernel.
+.SH SEE ALSO
+.BR ioctl_xfs_health_monitor (2)
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 08/26] xfs_io: monitor filesystem health events
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (6 preceding siblings ...)
2026-03-19 4:40 ` [PATCH 07/26] man2: document the media verification ioctl Darrick J. Wong
@ 2026-03-19 4:40 ` Darrick J. Wong
2026-03-19 4:41 ` [PATCH 09/26] xfs_io: add a media verify command Darrick J. Wong
` (17 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:40 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a subcommand to monitor for health events generated by the kernel.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
io/io.h | 1
io/Makefile | 1
io/healthmon.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++
io/init.c | 1
man/man8/xfs_io.8 | 25 +++++++
5 files changed, 214 insertions(+)
create mode 100644 io/healthmon.c
diff --git a/io/io.h b/io/io.h
index 35fb8339eeb5aa..2f5262bce6acbb 100644
--- a/io/io.h
+++ b/io/io.h
@@ -162,3 +162,4 @@ extern void bulkstat_init(void);
void exchangerange_init(void);
void fsprops_init(void);
void aginfo_init(void);
+void healthmon_init(void);
diff --git a/io/Makefile b/io/Makefile
index 444e2d6a557d5d..8e3783353a52b5 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -25,6 +25,7 @@ CFILES = \
fsuuid.c \
fsync.c \
getrusage.c \
+ healthmon.c \
imap.c \
init.c \
inject.c \
diff --git a/io/healthmon.c b/io/healthmon.c
new file mode 100644
index 00000000000000..5bf54ff6c717e6
--- /dev/null
+++ b/io/healthmon.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/paths.h"
+#include "libfrog/healthevent.h"
+#include "command.h"
+#include "init.h"
+#include "io.h"
+
+/* Print the option summary for the healthmon command. */
+static void
+healthmon_help(void)
+{
+	printf(_(
+"Monitor filesystem health events"
+"\n"
+"-b bufsize Use a buffer of this many bytes to read events.\n"
+"-c Replace the open file with the monitor file.\n"
+"-d delay_ms Sleep this many milliseconds between reads.\n"
+"-p Only probe for the existence of the ioctl.\n"
+"-v Request all events.\n"
+"\n"));
+}
+
+/*
+ * Pause for delay_ms milliseconds.  A zero delay returns 0 without sleeping;
+ * otherwise the nanosleep() result is passed back to the caller.
+ */
+static inline int
+monitor_sleep(
+	int			delay_ms)
+{
+	struct timespec		pause = {
+		.tv_sec		= delay_ms / 1000,
+		.tv_nsec	= (delay_ms % 1000) * 1000000,
+	};
+
+	return delay_ms ? nanosleep(&pause, NULL) : 0;
+}
+
+/*
+ * Open a health monitor on the current file and print every event read from
+ * it until read() returns zero or fails.
+ *
+ * bufsize is the size of the read buffer in bytes.  If consume is set, the
+ * open file's fd is closed and replaced with the monitor fd.  delay_ms is the
+ * pause taken before each read.  verbose asks the kernel for all events.
+ * only_probe returns as soon as the monitor fd has been opened.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+monitor(
+	size_t			bufsize,
+	bool			consume,
+	int			delay_ms,
+	bool			verbose,
+	bool			only_probe)
+{
+	struct xfs_health_monitor hmo = {
+		.format		= XFS_HEALTH_MONITOR_FMT_V0,
+	};
+	struct hme_prefix	pfx;
+	void			*buf;
+	ssize_t			bytes_read;
+	int			mon_fd;
+	bool			own_mon_fd = true;
+	int			ret = 1;
+
+	hme_prefix_init(&pfx, file->name);
+
+	/* Same flag that xfs_healer uses to request every event. */
+	if (verbose)
+		hmo.flags |= XFS_HEALTH_MONITOR_VERBOSE;
+
+	mon_fd = ioctl(file->fd, XFS_IOC_HEALTH_MONITOR, &hmo);
+	if (mon_fd < 0) {
+		perror("XFS_IOC_HEALTH_MONITOR");
+		return 1;
+	}
+
+	if (only_probe) {
+		ret = 0;
+		goto out_mon;
+	}
+
+	buf = malloc(bufsize);
+	if (!buf) {
+		perror("malloc");
+		goto out_mon;
+	}
+
+	if (consume) {
+		/*
+		 * The open file now refers to the monitor fd, so it must not
+		 * be closed behind the file table's back on the way out.
+		 */
+		close(file->fd);
+		file->fd = mon_fd;
+		own_mon_fd = false;
+	}
+
+	monitor_sleep(delay_ms);
+	while ((bytes_read = read(mon_fd, buf, bufsize)) > 0) {
+		struct xfs_health_monitor_event *hme = buf;
+
+		/* bytes_read is positive here, so the cast is safe. */
+		while ((size_t)bytes_read >= sizeof(*hme)) {
+			hme_report_event(&pfx, hme);
+			hme++;
+			bytes_read -= sizeof(*hme);
+		}
+		if (bytes_read > 0) {
+			printf("healthmon: %zd bytes remain?\n", bytes_read);
+			fflush(stdout);
+		}
+
+		monitor_sleep(delay_ms);
+	}
+	if (bytes_read < 0) {
+		perror("healthmon");
+		goto out_buf;
+	}
+
+	ret = 0;
+
+out_buf:
+	free(buf);
+out_mon:
+	if (own_mon_fd)
+		close(mon_fd);
+	return ret;
+}
+
+/*
+ * Parse a base-10 option argument that must lie within [min, INT_MAX].
+ * Returns true and stores the value in *out only if the entire string is a
+ * valid number in that range.
+ */
+static bool
+healthmon_parse_int(
+	const char		*str,
+	int			min,
+	int			*out)
+{
+	char			*end;
+	long			val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (errno || end == str || *end != '\0' || val < min || val > INT_MAX)
+		return false;
+
+	*out = val;
+	return true;
+}
+
+/*
+ * healthmon command: parse the options and start monitoring.
+ *
+ * Option errors set exitcode and return 0; otherwise the result of monitor()
+ * is returned.
+ */
+static int
+healthmon_f(
+	int			argc,
+	char			**argv)
+{
+	size_t			bufsize = 4096;
+	bool			consume = false;
+	bool			verbose = false;
+	bool			only_probe = false;
+	int			delay_ms = 0;
+	int			val;
+	int			c;
+
+	while ((c = getopt(argc, argv, "b:cd:pv")) != EOF) {
+		switch (c) {
+		case 'b':
+			/* A zero-length buffer could never hold an event. */
+			if (!healthmon_parse_int(optarg, 1, &val)) {
+				printf("%s: bufsize must be positive\n",
+						optarg);
+				exitcode = 1;
+				return 0;
+			}
+			bufsize = val;
+			break;
+		case 'c':
+			consume = true;
+			break;
+		case 'd':
+			/* Zero is allowed and means no delay. */
+			if (!healthmon_parse_int(optarg, 0, &val)) {
+				printf("%s: delay must be positive msecs\n",
+						optarg);
+				exitcode = 1;
+				return 0;
+			}
+			delay_ms = val;
+			break;
+		case 'p':
+			only_probe = true;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			exitcode = 1;
+			healthmon_help();
+			return 0;
+		}
+	}
+
+	return monitor(bufsize, consume, delay_ms, verbose, only_probe);
+}
+
+/* Command table entry for healthmon; .args mirrors the getopt string. */
+static struct cmdinfo healthmon_cmd = {
+	.name		= "healthmon",
+	.cfunc		= healthmon_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.flags		= CMD_FLAG_ONESHOT | CMD_NOMAP_OK,
+	.args		= "[-b bufsize] [-c] [-d delay_ms] [-p] [-v]",
+	.help		= healthmon_help,
+};
+
+/* Register the healthmon command with the xfs_io command table. */
+void
+healthmon_init(void)
+{
+ /* Set at runtime so that the summary goes through the _() macro. */
+ healthmon_cmd.oneline = _("monitor filesystem health events");
+
+ add_command(&healthmon_cmd);
+}
diff --git a/io/init.c b/io/init.c
index 49e9e7cb88214b..cb5573f45ccfbc 100644
--- a/io/init.c
+++ b/io/init.c
@@ -92,6 +92,7 @@ init_commands(void)
crc32cselftest_init();
exchangerange_init();
fsprops_init();
+ healthmon_init();
}
/*
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 0a673322fde3a1..f7f2956a54a7aa 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1356,6 +1356,31 @@ .SH FILESYSTEM COMMANDS
.B thaw
Undo the effects of a filesystem freeze operation.
Only available in expert mode and requires privileges.
+.TP
+.BI "healthmon [ \-b " bufsize " ] [ \-c ] [ \-d " delay_ms " ] [ \-p ] [ \-v ]"
+Watch for filesystem health events and write them to the console.
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.BI "\-b " bufsize
+Use a buffer of this size to read events from the kernel.
+.TP
+.BI \-c
+Close the open file and replace it with the monitor file.
+.TP
+.BI "\-d " delay_ms
+Sleep for this long between read attempts.
+.TP
+.B \-p
+Probe for the existence of the functionality by opening the monitoring fd and
+closing it immediately.
+.TP
+.BI \-v
+Request all health events, even if nothing changed.
+.PD
+.RE
+
.TP
.BI "inject [ " tag " ]"
Inject errors into a filesystem to observe filesystem behavior at
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 09/26] xfs_io: add a media verify command
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (7 preceding siblings ...)
2026-03-19 4:40 ` [PATCH 08/26] xfs_io: monitor filesystem health events Darrick J. Wong
@ 2026-03-19 4:41 ` Darrick J. Wong
2026-03-19 4:41 ` [PATCH 10/26] xfs_healer: create daemon to listen for health events Darrick J. Wong
` (16 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:41 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add a subcommand to invoke the media verification ioctl to make sure
that we can actually check the storage underneath an xfs filesystem.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
io/io.h | 1
io/Makefile | 3 +
io/init.c | 1
io/verify_media.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++
man/man8/xfs_io.8 | 42 ++++++++++++
5 files changed, 226 insertions(+), 1 deletion(-)
create mode 100644 io/verify_media.c
diff --git a/io/io.h b/io/io.h
index 2f5262bce6acbb..0f12b3cfed5e76 100644
--- a/io/io.h
+++ b/io/io.h
@@ -163,3 +163,4 @@ void exchangerange_init(void);
void fsprops_init(void);
void aginfo_init(void);
void healthmon_init(void);
+void verifymedia_init(void);
diff --git a/io/Makefile b/io/Makefile
index 8e3783353a52b5..79d5e172b8f31f 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -51,7 +51,8 @@ CFILES = \
sync.c \
sync_file_range.c \
truncate.c \
- utimes.c
+ utimes.c \
+ verify_media.c
LLDLIBS = $(LIBXCMD) $(LIBHANDLE) $(LIBFROG) $(LIBPTHREAD) $(LIBUUID)
LTDEPENDENCIES = $(LIBXCMD) $(LIBHANDLE) $(LIBFROG)
diff --git a/io/init.c b/io/init.c
index cb5573f45ccfbc..f2a551ef559200 100644
--- a/io/init.c
+++ b/io/init.c
@@ -93,6 +93,7 @@ init_commands(void)
exchangerange_init();
fsprops_init();
healthmon_init();
+ verifymedia_init();
}
/*
diff --git a/io/verify_media.c b/io/verify_media.c
new file mode 100644
index 00000000000000..e67567f675abfd
--- /dev/null
+++ b/io/verify_media.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "command.h"
+#include "input.h"
+#include "init.h"
+#include "io.h"
+
+/* Print the option summary for the verifymedia command. */
+static void
+verifymedia_help(void)
+{
+	printf(_(
+"\n"
+" Verify the media of the devices backing the filesystem.\n"
+"\n"
+" -b -- Do not issue IOs larger than this size.\n"
+" -d -- Verify the data device (default).\n"
+" -l -- Verify the log device.\n"
+" -r -- Verify the realtime device.\n"
+" -R -- Report media errors to fsnotify.\n"
+" -s -- Sleep this many usecs between IOs.\n"
+"\n"
+" start is the byte offset of the start of the range to verify. If the start\n"
+" is specified, the end may (optionally) be specified as well."
+"\n"
+" end is the byte offset of the end of the range to verify.\n"
+"\n"
+" If neither start nor end are specified, the media verification will\n"
+" check the entire device."
+"\n"));
+}
+
+/*
+ * verifymedia command: ask the kernel to verify the media of one of the
+ * devices backing the filesystem, then report either the range that failed
+ * or how much storage was verified.
+ *
+ * Always returns 0.  Argument errors and ioctl failures set exitcode.
+ */
+static int
+verifymedia_f(
+	int			argc,
+	char			**argv)
+{
+	xfs_daddr_t		orig_start_daddr = 0;
+	struct xfs_verify_media	me = {
+		.me_start_daddr	= orig_start_daddr,
+		.me_end_daddr	= ~0ULL,
+		.me_dev		= XFS_DEV_DATA,
+	};
+	struct timeval		t1, t2;
+	long long		l;
+	size_t			fsblocksize, fssectsize;
+	const char		*verifydev = _("datadev");
+	char			*endp;
+	int			c, ret;
+
+	init_cvtnum(&fsblocksize, &fssectsize);
+
+	while ((c = getopt(argc, argv, "b:dlrRs:")) != EOF) {
+		switch (c) {
+		case 'd':
+			me.me_dev = XFS_DEV_DATA;
+			verifydev = _("datadev");
+			break;
+		case 'l':
+			me.me_dev = XFS_DEV_LOG;
+			verifydev = _("logdev");
+			break;
+		case 'r':
+			me.me_dev = XFS_DEV_RT;
+			verifydev = _("rtdev");
+			break;
+		case 'b':
+			l = cvtnum(fsblocksize, fssectsize, optarg);
+			if (l < 0 || l > UINT_MAX) {
+				printf("non-numeric maxio argument -- %s\n",
+						optarg);
+				exitcode = 1;
+				return 0;
+			}
+			me.me_max_io_size = l;
+			break;
+		case 'R':
+			me.me_flags |= XFS_VERIFY_MEDIA_REPORT;
+			break;
+		case 's':
+			/*
+			 * atoi() turns garbage into 0, so parse with
+			 * strtoll() and insist that the whole string is a
+			 * number.  The upper bound matches what atoi() could
+			 * return.
+			 */
+			errno = 0;
+			l = strtoll(optarg, &endp, 10);
+			if (errno || endp == optarg || *endp != '\0' ||
+			    l < 0 || l > INT_MAX) {
+				printf("non-numeric rest_us argument -- %s\n",
+						optarg);
+				exitcode = 1;
+				return 0;
+			}
+			me.me_rest_us = l;
+			break;
+		default:
+			verifymedia_help();
+			exitcode = 1;
+			return 0;
+		}
+	}
+
+	/* Range start (optional) */
+	if (optind < argc) {
+		l = cvtnum(fsblocksize, fssectsize, argv[optind]);
+		if (l < 0) {
+			printf("non-numeric start argument -- %s\n",
+					argv[optind]);
+			exitcode = 1;
+			return 0;
+		}
+
+		/* Bytes to 512-byte units, rounding down. */
+		orig_start_daddr = l / 512;
+		me.me_start_daddr = orig_start_daddr;
+		optind++;
+	}
+
+	/* Range end (optional if range start was specified) */
+	if (optind < argc) {
+		l = cvtnum(fsblocksize, fssectsize, argv[optind]);
+		if (l < 0) {
+			printf("non-numeric end argument -- %s\n",
+					argv[optind]);
+			exitcode = 1;
+			return 0;
+		}
+
+		/*
+		 * Bytes to 512-byte units, rounding up.  Written without
+		 * "l + 511" so that a huge value cannot overflow.
+		 */
+		me.me_end_daddr = l / 512 + (l % 512 ? 1 : 0);
+		optind++;
+	}
+
+	if (optind < argc) {
+		printf("too many arguments -- %s\n", argv[optind]);
+		exitcode = 1;
+		return 0;
+	}
+
+	gettimeofday(&t1, NULL);
+	ret = ioctl(file->fd, XFS_IOC_VERIFY_MEDIA, &me);
+	gettimeofday(&t2, NULL);
+	t2 = tsub(t2, t1);
+	if (ret < 0) {
+		fprintf(stderr,
+ "%s: ioctl(XFS_IOC_VERIFY_MEDIA) [\"%s\"]: %s\n",
+				progname, file->name, strerror(errno));
+		exitcode = 1;
+		return 0;
+	}
+
+	if (me.me_ioerror) {
+		/* The range printed is what the ioctl left in the struct. */
+		fprintf(stderr,
+ "%s: verify error at offset %llu length %llu: %s\n",
+				verifydev,
+				BBTOB(me.me_start_daddr),
+				BBTOB(me.me_end_daddr - me.me_start_daddr),
+				strerror(me.me_ioerror));
+	} else {
+		unsigned long long	total;
+
+		if (me.me_end_daddr > orig_start_daddr)
+			total = BBTOB(me.me_end_daddr - orig_start_daddr);
+		else
+			total = 0;
+		report_io_times("verified", &t2, BBTOB(orig_start_daddr),
+				BBTOB(me.me_start_daddr - orig_start_daddr),
+				total, 1, false);
+	}
+
+	return 0;
+}
+
+/* Command table entry for verifymedia; .args mirrors the getopt string. */
+static struct cmdinfo verifymedia_cmd = {
+	.name		= "verifymedia",
+	.cfunc		= verifymedia_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.flags		= CMD_FLAG_ONESHOT | CMD_NOMAP_OK,
+	.args		= "[-b maxio] [-dlrR] [-s rest_us] [start [end]]",
+	.help		= verifymedia_help,
+};
+
+/* Register the verifymedia command with the xfs_io command table. */
+void
+verifymedia_init(void)
+{
+	/* Give the command a one-line summary, as healthmon_init does. */
+	verifymedia_cmd.oneline =
+		_("verify the media of the devices backing the filesystem");
+
+	add_command(&verifymedia_cmd);
+}
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index f7f2956a54a7aa..2090cd4c0b2641 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1389,6 +1389,48 @@ .SH FILESYSTEM COMMANDS
argument, displays the list of error tags available.
Only available in expert mode and requires privileges.
+.TP
+.BI "verifymedia [ \-bdlrsR ] [ " start " [ " end " ]]"
+Check for media errors on the storage devices backing XFS.
+The
+.I start
+and
+.I end
+parameters are the range of physical storage to verify, in bytes.
+The
+.I start
+parameter is inclusive.
+The
+.I end
+parameter is exclusive.
+If neither
+.IR start " nor " end
+are specified, the entire device will be verified.
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.B \-b
+Don't issue any IOs larger than this size.
+.TP
+.B \-d
+Verify the data device.
+This is the default.
+.TP
+.B \-l
+Verify the log device instead of the data device.
+.TP
+.B \-r
+Verify the realtime device instead of the data device.
+.TP
+.B \-R
+Report media errors to fsnotify.
+.TP
+.B \-s
+Sleep this many microseconds between IO requests.
+.PD
+.RE
+
.TP
.BI "rginfo [ \-r " rgno " ]"
Show information about or update the state of realtime allocation groups.
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 10/26] xfs_healer: create daemon to listen for health events
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (8 preceding siblings ...)
2026-03-19 4:41 ` [PATCH 09/26] xfs_io: add a media verify command Darrick J. Wong
@ 2026-03-19 4:41 ` Darrick J. Wong
2026-03-19 4:41 ` [PATCH 11/26] xfs_healer: enable repairing filesystems Darrick J. Wong
` (15 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:41 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a daemon program that can listen for and log health events.
Eventually this will be used to self-heal filesystems in real time.
Because events can take a while to process, the main thread reads event
objects from the healthmon fd and dispatches them to a background
workqueue as quickly as it can. This split of responsibilities is
necessary because the kernel event queue will drop events if the queue
fills up, and each event can take some time to process (logging,
repairs, etc.) so we don't want to lose events.
To be clear, xfs_healer and xfs_scrub are complementary tools:
Scrub walks the whole filesystem, finds stuff that needs fixing or
rebuilding, and rebuilds it. This is sort of analogous to a patrol
scrub.
Healer listens for metadata corruption messages from the kernel and
issues a targeted repair of that structure. This is kind of like an
on-demand scrub.
My end goal is that xfs_healer (the service) is active all the time and
can respond instantly to a corruption report, whereas xfs_scrub (the
service) gets run periodically as a cron job.
xfs_healer can decide that it's overwhelmed with problems and start
xfs_scrub to deal with the mess. Ideally you don't crash the filesystem
and then have to use xfs_repair to smash your way back to a mountable
filesystem.
By default we run xfs_healer as a background service, which means that
we only start two threads -- one to read the events, and another to
process them. In other words, we try not to use all available hardware
resources for repairs. The foreground mode switch starts up a large
number of threads to try to increase parallelism, which may or may not
be useful for repairs depending on how much metadata the kernel needs to
scan.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 47 ++++++
Makefile | 5 +
configure.ac | 6 +
healer/Makefile | 35 ++++
healer/xfs_healer.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/builddefs.in | 1
6 files changed, 485 insertions(+)
create mode 100644 healer/xfs_healer.h
create mode 100644 healer/Makefile
create mode 100644 healer/xfs_healer.c
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
new file mode 100644
index 00000000000000..bcddde5db0cc47
--- /dev/null
+++ b/healer/xfs_healer.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HEALER_XFS_HEALER_H_
+#define XFS_HEALER_XFS_HEALER_H_
+
+extern char *progname;
+
+/*
+ * When running in environments with restrictive security policies, healer
+ * might not be allowed to access the global mount tree. However, processes
+ * are usually still allowed to see their own mount tree, so use this path for
+ * all mount table queries.
+ */
+#define _PATH_PROC_MOUNTS "/proc/self/mounts"
+
+/*
+ * State for monitoring one mountpoint. The same context is used by the
+ * thread that reads events and by the workqueue handlers that process them.
+ */
+struct healer_ctx {
+ /* CLI options, must be int */
+ int debug;
+ int log;
+ int everything;
+ int foreground;
+
+ /* fd and fs geometry for mount */
+ struct xfs_fd mnt;
+
+ /* Shared reference to the user's mountpoint for logging */
+ const char *mntpoint;
+
+ /* Shared reference to the getmntent fsname for reconnecting */
+ const char *fsname;
+
+ /* file stream of monitor and buffer */
+ FILE *mon_fp;
+ char *mon_buf;
+
+ /* coordinates logging printfs */
+ pthread_mutex_t conlock;
+
+ /* event queue */
+ struct workqueue event_queue;
+ /* set once event_queue has been created and needs tearing down */
+ bool queue_active;
+};
+
+#endif /* XFS_HEALER_XFS_HEALER_H_ */
diff --git a/Makefile b/Makefile
index c73aa391bc5f43..1f499c30f3457e 100644
--- a/Makefile
+++ b/Makefile
@@ -69,6 +69,10 @@ ifeq ("$(ENABLE_SCRUB)","yes")
TOOL_SUBDIRS += scrub
endif
+ifeq ("$(ENABLE_HEALER)","yes")
+TOOL_SUBDIRS += healer
+endif
+
ifneq ("$(XGETTEXT)","")
TOOL_SUBDIRS += po
endif
@@ -100,6 +104,7 @@ mkfs: libxcmd
spaceman: libxcmd libhandle
scrub: libhandle libxcmd
rtcp: libfrog
+healer: libhandle
ifeq ($(HAVE_BUILDDEFS), yes)
include $(BUILDRULES)
diff --git a/configure.ac b/configure.ac
index cffcaf373cfa5e..90af1f84035ee6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -110,6 +110,12 @@ AC_ARG_ENABLE(libicu,
[ --enable-libicu=[yes/no] Enable Unicode name scanning in xfs_scrub (libicu) [default=probe]],,
enable_libicu=probe)
+# Enable xfs_healer build
+AC_ARG_ENABLE(healer,
+[ --enable-healer=[yes/no] Enable build of xfs_healer utility [[default=yes]]],,
+ enable_healer=yes)
+AC_SUBST(enable_healer)
+
#
# If the user specified a libdir ending in lib64 do not append another
# 64 to the library names.
diff --git a/healer/Makefile b/healer/Makefile
new file mode 100644
index 00000000000000..e82c820883669a
--- /dev/null
+++ b/healer/Makefile
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2024-2026 Oracle. All Rights Reserved.
+#
+
+TOPDIR = ..
+builddefs=$(TOPDIR)/include/builddefs
+include $(builddefs)
+
+INSTALL_HEALER = install-healer
+
+LTCOMMAND = xfs_healer
+
+CFILES = \
+xfs_healer.c
+
+HFILES = \
+xfs_healer.h
+
+LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
+LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
+LLDFLAGS = -static
+
+default: depend $(LTCOMMAND)
+
+include $(BUILDRULES)
+
+install: $(INSTALL_HEALER)
+
+install-healer: default
+ $(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
+ $(INSTALL) -m 755 $(LTCOMMAND) $(PKG_LIBEXEC_DIR)
+
+install-dev:
+
+-include .dep
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
new file mode 100644
index 00000000000000..e0076fff381632
--- /dev/null
+++ b/healer/xfs_healer.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "platform_defs.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/paths.h"
+#include "libfrog/healthevent.h"
+#include "libfrog/workqueue.h"
+#include "libfrog/systemd.h"
+#include "xfs_healer.h"
+
+/* Program name; needed for libfrog error reports. */
+char *progname = "xfs_healer";
+
+/*
+ * Ask the kernel for a health monitoring fd through the open mount fd.  The
+ * ioctl return value is handed straight back to the caller.
+ */
+static int
+open_health_monitor(
+	struct healer_ctx	*ctx,
+	int			mnt_fd)
+{
+	struct xfs_health_monitor request = {
+		.format		= XFS_HEALTH_MONITOR_FMT_V0,
+	};
+
+	/* --everything asks for the verbose event stream */
+	request.flags |= ctx->everything ? XFS_HEALTH_MONITOR_VERBOSE : 0;
+
+	return ioctl(mnt_fd, XFS_IOC_HEALTH_MONITOR, &request);
+}
+
+/*
+ * Lost, running, unmount and shutdown events can only be reported; there is
+ * nothing to act upon.  Returns true for those event types.
+ */
+static bool
+event_not_actionable(
+	const struct xfs_health_monitor_event	*hme)
+{
+	return hme->type == XFS_HEALTH_MONITOR_TYPE_LOST ||
+	       hme->type == XFS_HEALTH_MONITOR_TYPE_RUNNING ||
+	       hme->type == XFS_HEALTH_MONITOR_TYPE_UNMOUNT ||
+	       hme->type == XFS_HEALTH_MONITOR_TYPE_SHUTDOWN;
+}
+
+/*
+ * Log everything when logging is enabled; otherwise log only the events
+ * that cannot be acted upon.
+ */
+static bool
+event_loggable(
+	const struct healer_ctx			*ctx,
+	const struct xfs_health_monitor_event	*hme)
+{
+	if (ctx->log)
+		return true;
+
+	return event_not_actionable(hme);
+}
+
+/*
+ * Workqueue callback: report one event if it should be logged, then free it.
+ * The event object passed in arg belongs to this function.
+ */
+static void
+handle_event(
+	struct workqueue	*wq,
+	uint32_t		index,
+	void			*arg)
+{
+	struct xfs_health_monitor_event	*event = arg;
+	struct healer_ctx	*ctx = wq->wq_ctx;
+	struct hme_prefix	prefix;
+	bool			want_log;
+
+	want_log = event_loggable(ctx, event);
+	hme_prefix_init(&prefix, ctx->mntpoint);
+
+	/*
+	 * Events that cannot be acted upon are purely informational, so
+	 * event_loggable always says yes for them.
+	 */
+	if (want_log) {
+		/* conlock keeps reports from different workers apart */
+		pthread_mutex_lock(&ctx->conlock);
+		hme_report_event(&prefix, event);
+		pthread_mutex_unlock(&ctx->conlock);
+	}
+
+	free(event);
+}
+
+/*
+ * Look up the mount table entry for the mountpoint being monitored and save
+ * a copy of its source name in ctx->fsname.  The fs_table_ helpers are not
+ * used because we might be running in a restricted environment where device
+ * files cannot be accessed at all.
+ *
+ * Returns 0 if a source name was captured and -1 otherwise.
+ */
+static int
+try_capture_fsinfo(
+	struct healer_ctx	*ctx)
+{
+	char			want_dir[PATH_MAX];
+	char			this_dir[PATH_MAX];
+	struct mntent		*ent;
+	FILE			*mounts;
+
+	if (realpath(ctx->mntpoint, want_dir) == NULL)
+		return -1;
+
+	mounts = setmntent(_PATH_PROC_MOUNTS, "r");
+	if (!mounts)
+		return -1;
+
+	while ((ent = getmntent(mounts)) != NULL) {
+		/* Skip anything that is not the xfs mount we asked about. */
+		if (strcmp(ent->mnt_type, "xfs") != 0 ||
+		    realpath(ent->mnt_dir, this_dir) == NULL ||
+		    strcmp(want_dir, this_dir) != 0)
+			continue;
+
+		ctx->fsname = strdup(ent->mnt_fsname);
+		break;
+	}
+
+	endmntent(mounts);
+
+	if (!ctx->fsname)
+		return -1;
+	return 0;
+}
+
+/*
+ * Number of event handler threads to start: one per cpu in foreground mode,
+ * and a single thread otherwise.
+ */
+static unsigned int
+healer_nproc(
+	const struct healer_ctx	*ctx)
+{
+	if (!ctx->foreground)
+		return 1;
+
+	return platform_nproc();
+}
+
+/*
+ * Set ourselves up to monitor the given mountpoint for health events: open
+ * the mountpoint, capture the fs source name, open the health monitor, wrap
+ * it in a buffered stream, and create the event workqueue.
+ *
+ * Returns 0 on success.  On failure, an error is printed, every fd opened
+ * here is closed, and -1 is returned.
+ */
+static int
+setup_monitor(
+	struct healer_ctx	*ctx)
+{
+	const long		BUF_SIZE = sysconf(_SC_PAGE_SIZE) * 2;
+	int			mon_fd;
+	int			ret;
+
+	ret = xfd_open(&ctx->mnt, ctx->mntpoint, O_RDONLY);
+	if (ret) {
+		perror(ctx->mntpoint);
+		return -1;
+	}
+
+	ret = try_capture_fsinfo(ctx);
+	if (ret) {
+		fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+				_("Not a XFS mount point."));
+		goto out_mnt_fd;
+	}
+
+	/*
+	 * Open the health monitor, then close the mountpoint to avoid pinning
+	 * it.  We can reconnect later if need be.
+	 */
+	mon_fd = open_health_monitor(ctx, ctx->mnt.fd);
+	if (mon_fd < 0) {
+		switch (errno) {
+		case ENOTTY:
+		case EOPNOTSUPP:
+			fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+					_("XFS health monitoring not supported."));
+			break;
+		case EEXIST:
+			fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+					_("XFS health monitoring already running."));
+			break;
+		default:
+			perror(ctx->mntpoint);
+			break;
+		}
+
+		goto out_mnt_fd;
+	}
+	close(ctx->mnt.fd);
+	ctx->mnt.fd = -1;
+
+	/*
+	 * mon_fp consumes mon_fd.  We intentionally leave mon_fp attached to
+	 * the context so that we keep the monitoring fd open until we've torn
+	 * down all the background threads.
+	 */
+	ctx->mon_fp = fdopen(mon_fd, "r");
+	if (!ctx->mon_fp) {
+		perror(ctx->mntpoint);
+		goto out_mon_fd;
+	}
+
+	/*
+	 * From here on fclose() is what closes the monitor fd.  Forget the
+	 * raw fd so that the error path below cannot close it a second time.
+	 */
+	mon_fd = -1;
+
+	/* Increase the buffer size so that we can reduce kernel calls */
+	ctx->mon_buf = malloc(BUF_SIZE);
+	if (ctx->mon_buf)
+		setvbuf(ctx->mon_fp, ctx->mon_buf, _IOFBF, BUF_SIZE);
+
+	/*
+	 * Queue up to 1MB of events before we stop trying to read events from
+	 * the kernel as quickly as we can.  Note that the kernel won't accrue
+	 * more than 32K of internal events before it starts dropping them.
+	 */
+	ret = workqueue_create_bound(&ctx->event_queue, ctx, healer_nproc(ctx),
+			1048576 / sizeof(struct xfs_health_monitor_event));
+	if (ret) {
+		errno = ret;
+		fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+				_("worker threadpool setup"), strerror(errno));
+		goto out_mon_fp;
+	}
+	ctx->queue_active = true;
+
+	return 0;
+
+out_mon_fp:
+	if (ctx->mon_fp)
+		fclose(ctx->mon_fp);
+	ctx->mon_fp = NULL;
+out_mon_fd:
+	if (mon_fd >= 0)
+		close(mon_fd);
+out_mnt_fd:
+	if (ctx->mnt.fd >= 0)
+		close(ctx->mnt.fd);
+	ctx->mnt.fd = -1;
+	return -1;
+}
+
+/*
+ * Read events from the monitor stream and hand each one to the event queue.
+ * Stops when nothing more can be read, once an unmount event has been
+ * queued, or when an event cannot be allocated or queued.
+ */
+static void
+monitor(
+	struct healer_ctx	*ctx)
+{
+	for (;;) {
+		struct xfs_health_monitor_event	*event;
+		bool			unmounting;
+		int			error;
+
+		event = malloc(sizeof(*event));
+		if (event == NULL) {
+			pthread_mutex_lock(&ctx->conlock);
+			fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+					_("could not allocate event object"));
+			pthread_mutex_unlock(&ctx->conlock);
+			return;
+		}
+
+		if (fread(event, sizeof(*event), 1, ctx->mon_fp) == 0) {
+			free(event);
+			return;
+		}
+
+		/* Look at the type now; the handler frees the event. */
+		unmounting = event->type == XFS_HEALTH_MONITOR_TYPE_UNMOUNT;
+
+		/* handle_event owns the event if the workqueue_add succeeds */
+		error = workqueue_add(&ctx->event_queue, handle_event, 0,
+				event);
+		if (error) {
+			pthread_mutex_lock(&ctx->conlock);
+			fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+					_("could not queue event object"),
+					strerror(error));
+			pthread_mutex_unlock(&ctx->conlock);
+			free(event);
+			return;
+		}
+
+		if (unmounting)
+			return;
+	}
+}
+
+/*
+ * Release whatever setup_monitor managed to create: the event queue, the
+ * monitor stream, and the stream buffer, in that order.
+ */
+static void
+teardown_monitor(
+	struct healer_ctx	*ctx)
+{
+	struct workqueue	*wq = &ctx->event_queue;
+
+	if (ctx->queue_active) {
+		workqueue_terminate(wq);
+		workqueue_destroy(wq);
+	}
+
+	if (ctx->mon_fp != NULL)
+		fclose(ctx->mon_fp);
+	ctx->mon_fp = NULL;
+
+	/* The stream is closed before its buffer is freed. */
+	free(ctx->mon_buf);
+	ctx->mon_buf = NULL;
+}
+
+/* Print the command line synopsis to stderr and exit with failure. */
+static void __attribute__((noreturn))
+usage(void)
+{
+ fprintf(stderr, "%s %s %s\n", _("Usage:"), progname,
+ _("[OPTIONS] mountpoint"));
+ fprintf(stderr, "\n");
+ fprintf(stderr, _("Options:\n"));
+ fprintf(stderr, _(" --debug Enable debugging messages.\n"));
+ fprintf(stderr, _(" --everything Capture all events.\n"));
+ fprintf(stderr, _(" --foreground Process events as soon as possible.\n"));
+ fprintf(stderr, _(" --quiet Do not log health events to stdout.\n"));
+ fprintf(stderr, _(" -V Print version.\n"));
+
+ exit(EXIT_FAILURE);
+}
+
+/* Indexes into the long_options table that main() hands to getopt_long. */
+enum long_opt_nr {
+ LOPT_DEBUG,
+ LOPT_EVERYTHING,
+ LOPT_FOREGROUND,
+ LOPT_HELP,
+ LOPT_QUIET,
+
+ /* index of the all-zeroes entry that terminates the table */
+ LOPT_MAX,
+};
+
+/*
+ * Parse the command line, set up monitoring of the one mountpoint argument,
+ * and process events until monitor() returns.
+ */
+int
+main(
+ int argc,
+ char **argv)
+{
+ struct healer_ctx ctx = {
+ .conlock = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
+ .log = 1,
+ .mnt.fd = -1,
+ };
+ int option_index;
+ int vflag = 0;
+ int c;
+ int ret;
+
+ progname = basename(argv[0]);
+ setlocale(LC_ALL, "");
+ bindtextdomain(PACKAGE, LOCALEDIR);
+ textdomain(PACKAGE);
+
+ /*
+ * Every long option except --help stores its value directly into an
+ * int field of ctx through the flag pointer.
+ */
+ struct option long_options[] = {
+ [LOPT_DEBUG] = {"debug", no_argument, &ctx.debug, 1 },
+ [LOPT_EVERYTHING] = {"everything", no_argument, &ctx.everything, 1 },
+ [LOPT_FOREGROUND] = {"foreground", no_argument, &ctx.foreground, 1 },
+ [LOPT_HELP] = {"help", no_argument, NULL, 0 },
+ [LOPT_QUIET] = {"quiet", no_argument, &ctx.log, 0 },
+
+ [LOPT_MAX] = {NULL, 0, NULL, 0 },
+ };
+
+ while ((c = getopt_long(argc, argv, "V", long_options, &option_index))
+ != EOF) {
+ switch (c) {
+ case 0:
+ /* a long option; only --help needs handling here */
+ switch (option_index) {
+ case LOPT_HELP:
+ usage();
+ break;
+ default:
+ break;
+ }
+ break;
+ case 'V':
+ vflag++;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (vflag) {
+ fprintf(stdout, "%s %s %s\n", progname, _("version"), VERSION);
+ fflush(stdout);
+ return EXIT_SUCCESS;
+ }
+
+ /* exactly one mountpoint argument is required */
+ if (optind != argc - 1)
+ usage();
+
+ ctx.mntpoint = argv[optind];
+
+ ret = setup_monitor(&ctx);
+ if (ret)
+ goto out_events;
+
+ monitor(&ctx);
+
+out_events:
+ /* teardown_monitor only releases what was actually set up */
+ teardown_monitor(&ctx);
+ free((char *)ctx.fsname);
+ return systemd_service_exit(ret);
+}
diff --git a/include/builddefs.in b/include/builddefs.in
index d2d25c8a0ed676..0ab2bf1702f0f0 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -91,6 +91,7 @@ ENABLE_SHARED = @enable_shared@
ENABLE_GETTEXT = @enable_gettext@
ENABLE_EDITLINE = @enable_editline@
ENABLE_SCRUB = @enable_scrub@
+ENABLE_HEALER = @enable_healer@
HAVE_ZIPPED_MANPAGES = @have_zipped_manpages@
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 11/26] xfs_healer: enable repairing filesystems
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (9 preceding siblings ...)
2026-03-19 4:41 ` [PATCH 10/26] xfs_healer: create daemon to listen for health events Darrick J. Wong
@ 2026-03-19 4:41 ` Darrick J. Wong
2026-03-19 4:41 ` [PATCH 12/26] xfs_healer: use getparents to look up file names Darrick J. Wong
` (14 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:41 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make it so that our health monitoring daemon can initiate repairs in
response to reports of corrupt filesystem metadata. Repairs are
initiated from the background workers as explained in the previous
patch.
Note that just like xfs_scrub, xfs_healer's ability to repair metadata
relies heavily on back references such as reverse mappings and directory
parent pointers to add redundancy to the filesystem. Check for these
two features and whine a bit if they are missing, just like scrub.
There's a bit of trickery with the fd that is used to initiate repairs
in the kernel. Because an open fd will pin the filesystem in memory,
xfs_healer can only hold an open fd to the target filesystem while it's
performing repairs. Therefore, at startup xfs_healer must sample enough
information about the target filesystem to reconnect to it later on.
Currently, the fs source (aka the data device path) and the root
directory handle are sufficient to do this.
Someday we might be able to have revocable fds, which would eliminate
the need for such efforts in userspace.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 28 ++++++
libfrog/flagmap.h | 3 +
libfrog/healthevent.h | 12 ++
healer/Makefile | 2
healer/fsrepair.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++
healer/weakhandle.c | 115 +++++++++++++++++++++++
healer/xfs_healer.c | 55 +++++++++++
libfrog/flagmap.c | 17 +++
libfrog/healthevent.c | 117 +++++++++++++++++++++++
9 files changed, 598 insertions(+)
create mode 100644 healer/fsrepair.c
create mode 100644 healer/weakhandle.c
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index bcddde5db0cc47..a4de1ad32a408f 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -8,6 +8,9 @@
extern char *progname;
+struct weakhandle;
+struct hme_prefix;
+
/*
* When running in environments with restrictive security policies, healer
* might not be allowed to access the global mount tree. However, processes
@@ -22,6 +25,7 @@ struct healer_ctx {
int log;
int everything;
int foreground;
+ int want_repair;
/* fd and fs geometry for mount */
struct xfs_fd mnt;
@@ -32,6 +36,9 @@ struct healer_ctx {
/* Shared reference to the getmntent fsname for reconnecting */
const char *fsname;
+ /* weak file handle so we can reattach to filesystem */
+ struct weakhandle *wh;
+
/* file stream of monitor and buffer */
FILE *mon_fp;
char *mon_buf;
@@ -44,4 +51,25 @@ struct healer_ctx {
bool queue_active;
};
+/* Does the fs geometry sampled into ctx->mnt have the RMAPBT flag set? */
+static inline bool healer_has_rmapbt(const struct healer_ctx *ctx)
+{
+ return ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT;
+}
+
+/* Does the fs geometry sampled into ctx->mnt have the PARENT flag set? */
+static inline bool healer_has_parent(const struct healer_ctx *ctx)
+{
+ return ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_PARENT;
+}
+
+/* fsrepair.c */
+int repair_metadata(struct healer_ctx *ctx, const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme);
+bool healer_can_repair(struct healer_ctx *ctx);
+
+/* weakhandle.c */
+int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname,
+ struct weakhandle **whp);
+int weakhandle_reopen(struct weakhandle *wh, int *fd);
+void weakhandle_free(struct weakhandle **whp);
+
#endif /* XFS_HEALER_XFS_HEALER_H_ */
diff --git a/libfrog/flagmap.h b/libfrog/flagmap.h
index 8031d75a7c02a8..05110c3544dc97 100644
--- a/libfrog/flagmap.h
+++ b/libfrog/flagmap.h
@@ -14,6 +14,9 @@ struct flag_map {
void mask_to_string(const struct flag_map *map, unsigned long long mask,
const char *delimiter, char *buf, size_t bufsize);
+const char *lowest_set_mask_string(const struct flag_map *map,
+ unsigned long long mask);
+
const char *value_to_string(const struct flag_map *map,
unsigned long long value);
diff --git a/libfrog/healthevent.h b/libfrog/healthevent.h
index 6de41bc797100c..4f3c8ba639ec4c 100644
--- a/libfrog/healthevent.h
+++ b/libfrog/healthevent.h
@@ -40,4 +40,16 @@ hme_prefix_init(
void hme_report_event(const struct hme_prefix *pfx,
const struct xfs_health_monitor_event *hme);
+enum repair_outcome {
+ REPAIR_SUCCESS,
+ REPAIR_FAILED,
+ REPAIR_PROBABLY_OK,
+ REPAIR_UNNECESSARY,
+};
+
+void report_health_repair(const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme,
+ uint32_t event_mask,
+ enum repair_outcome outcome);
+
#endif /* LIBFROG_HEALTHEVENT_H_ */
diff --git a/healer/Makefile b/healer/Makefile
index e82c820883669a..981192b81af626 100644
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -11,6 +11,8 @@ INSTALL_HEALER = install-healer
LTCOMMAND = xfs_healer
CFILES = \
+fsrepair.c \
+weakhandle.c \
xfs_healer.c
HFILES = \
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
new file mode 100644
index 00000000000000..907afca3dba8a7
--- /dev/null
+++ b/healer/fsrepair.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include "platform_defs.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/workqueue.h"
+#include "libfrog/healthevent.h"
+#include "xfs_healer.h"
+
+/*
+ * Map the output flags of a repair call onto a repair outcome.  Corruption or
+ * an incomplete run wins over a failed cross-reference, which in turn wins
+ * over "no repair needed"; if none of those flags is set the repair worked.
+ */
+static enum repair_outcome from_repair_oflags(uint32_t oflags)
+{
+	const uint32_t	failed = XFS_SCRUB_OFLAG_CORRUPT |
+				 XFS_SCRUB_OFLAG_INCOMPLETE;
+
+	if ((oflags & failed) != 0)
+		return REPAIR_FAILED;
+	else if ((oflags & XFS_SCRUB_OFLAG_XFAIL) != 0)
+		return REPAIR_PROBABLY_OK;
+	else if ((oflags & XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) != 0)
+		return REPAIR_UNNECESSARY;
+
+	return REPAIR_SUCCESS;
+}
+
+/* Pairs one health event mask bit with the scrub type used to repair it. */
+struct u32_scrub {
+ uint32_t event_mask;
+ uint32_t scrub_type;
+};
+
+/*
+ * Walk a table of u32_scrub entries that ends with an entry whose scrub_type
+ * is zero, running the statement that follows only for the entries whose
+ * event_mask intersects @mask.  The expansion ends in a bare "if", so never
+ * attach an "else" to the loop body.
+ */
+#define foreach_scrub_type(cur, mask, coll) \
+ for ((cur) = (coll); (cur)->scrub_type != 0; (cur)++) \
+ if ((mask) & (cur)->event_mask)
+
+/*
+ * Ask the kernel to repair one piece of metadata through the scrub ioctl.
+ * Any ioctl error counts as a failed repair; otherwise the outcome is
+ * derived from the flags that the kernel wrote back.
+ */
+static inline enum repair_outcome
+xfs_repair_metadata(
+	int				fd,
+	uint32_t			scrub_type,
+	uint32_t			group,
+	uint64_t			ino,
+	uint32_t			gen)
+{
+	struct xfs_scrub_metadata	req = { 0 };
+
+	req.sm_type = scrub_type;
+	req.sm_flags = XFS_SCRUB_IFLAG_REPAIR;
+	req.sm_ino = ino;
+	req.sm_gen = gen;
+	req.sm_agno = group;
+
+	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &req) != 0)
+		return REPAIR_FAILED;
+
+	return from_repair_oflags(req.sm_flags);
+}
+
+/*
+ * React to a fs-domain corruption event by repairing it.  Every structure
+ * flagged in hme->e.fs.mask that has an entry in the table below gets one
+ * repair call on @mnt_fd, and each outcome is reported while holding
+ * ctx->conlock.
+ */
+static void
+try_repair_wholefs(
+ struct healer_ctx *ctx,
+ const struct hme_prefix *pfx,
+ int mnt_fd,
+ const struct xfs_health_monitor_event *hme)
+{
+/* X(code, type) pairs XFS_FSOP_GEOM_SICK_<code> with XFS_SCRUB_TYPE_<type> */
+#define X(code, type) { XFS_FSOP_GEOM_SICK_ ## code, XFS_SCRUB_TYPE_ ## type }
+ static const struct u32_scrub FS_STRUCTURES[] = {
+ X(COUNTERS, FSCOUNTERS),
+ X(UQUOTA, UQUOTA),
+ X(GQUOTA, GQUOTA),
+ X(PQUOTA, PQUOTA),
+ X(RT_BITMAP, RTBITMAP),
+ X(RT_SUMMARY, RTSUM),
+ X(QUOTACHECK, QUOTACHECK),
+ X(NLINKS, NLINKS),
+ /* terminator that stops foreach_scrub_type */
+ {0, 0},
+ };
+#undef X
+ const struct u32_scrub *f;
+
+ foreach_scrub_type(f, hme->e.fs.mask, FS_STRUCTURES) {
+ /* group, inode number and generation are all passed as zero here */
+ enum repair_outcome outcome =
+ xfs_repair_metadata(mnt_fd, f->scrub_type, 0, 0, 0);
+
+ pthread_mutex_lock(&ctx->conlock);
+ report_health_repair(pfx, hme, f->event_mask, outcome);
+ pthread_mutex_unlock(&ctx->conlock);
+ }
+}
+
+/*
+ * React to an ag corruption event by repairing it.  Every structure flagged
+ * in hme->e.group.mask that has an entry in the table below gets one repair
+ * call on @mnt_fd for group hme->e.group.gno, and each outcome is reported
+ * while holding ctx->conlock.
+ */
+static void
+try_repair_ag(
+ struct healer_ctx *ctx,
+ const struct hme_prefix *pfx,
+ int mnt_fd,
+ const struct xfs_health_monitor_event *hme)
+{
+/* X(code, type) pairs XFS_AG_GEOM_SICK_<code> with XFS_SCRUB_TYPE_<type> */
+#define X(code, type) { XFS_AG_GEOM_SICK_ ## code, XFS_SCRUB_TYPE_ ## type }
+ static const struct u32_scrub AG_STRUCTURES[] = {
+ X(SB, SB),
+ X(AGF, AGF),
+ X(AGFL, AGFL),
+ X(AGI, AGI),
+ X(BNOBT, BNOBT),
+ X(CNTBT, CNTBT),
+ X(INOBT, INOBT),
+ X(FINOBT, FINOBT),
+ X(RMAPBT, RMAPBT),
+ X(REFCNTBT, REFCNTBT),
+ /* terminator that stops foreach_scrub_type */
+ {0, 0},
+ };
+#undef X
+ const struct u32_scrub *f;
+
+ foreach_scrub_type(f, hme->e.group.mask, AG_STRUCTURES) {
+ /* inode number and generation are passed as zero here */
+ enum repair_outcome outcome =
+ xfs_repair_metadata(mnt_fd, f->scrub_type,
+ hme->e.group.gno, 0, 0);
+
+ pthread_mutex_lock(&ctx->conlock);
+ report_health_repair(pfx, hme, f->event_mask, outcome);
+ pthread_mutex_unlock(&ctx->conlock);
+ }
+}
+
+/*
+ * React to a rtgroup corruption event by repairing it.  Every structure
+ * flagged in hme->e.group.mask that has an entry in the table below gets one
+ * repair call on @mnt_fd for group hme->e.group.gno, and each outcome is
+ * reported while holding ctx->conlock.
+ */
+static void
+try_repair_rtgroup(
+ struct healer_ctx *ctx,
+ const struct hme_prefix *pfx,
+ int mnt_fd,
+ const struct xfs_health_monitor_event *hme)
+{
+/* X(code, type) pairs XFS_RTGROUP_GEOM_SICK_<code> with XFS_SCRUB_TYPE_<type> */
+#define X(code, type) { XFS_RTGROUP_GEOM_SICK_ ## code, XFS_SCRUB_TYPE_ ## type }
+ static const struct u32_scrub RTG_STRUCTURES[] = {
+ X(SUPER, RGSUPER),
+ X(BITMAP, RTBITMAP),
+ X(SUMMARY, RTSUM),
+ X(RMAPBT, RTRMAPBT),
+ X(REFCNTBT, RTREFCBT),
+ /* terminator that stops foreach_scrub_type */
+ {0, 0},
+ };
+#undef X
+ const struct u32_scrub *f;
+
+ foreach_scrub_type(f, hme->e.group.mask, RTG_STRUCTURES) {
+ /* inode number and generation are passed as zero here */
+ enum repair_outcome outcome =
+ xfs_repair_metadata(mnt_fd, f->scrub_type,
+ hme->e.group.gno, 0, 0);
+
+ pthread_mutex_lock(&ctx->conlock);
+ report_health_repair(pfx, hme, f->event_mask, outcome);
+ pthread_mutex_unlock(&ctx->conlock);
+ }
+}
+
+/*
+ * React to an inode-domain corruption event by repairing it.  Every structure
+ * flagged in hme->e.inode.mask that has an entry in the XFS_BS_SICK_ to
+ * XFS_SCRUB_TYPE_ table below gets one repair call on @mnt_fd for the inode
+ * number and generation named in the event (the group argument is zero), and
+ * each outcome is reported while holding ctx->conlock.  The {0, 0} entry
+ * terminates the table for foreach_scrub_type.
+ */
+static void
+try_repair_inode(
+ struct healer_ctx *ctx,
+ const struct hme_prefix *pfx,
+ int mnt_fd,
+ const struct xfs_health_monitor_event *hme)
+{
+#define X(code, type) { XFS_BS_SICK_ ## code, XFS_SCRUB_TYPE_ ## type }
+ static const struct u32_scrub INODE_STRUCTURES[] = {
+ X(INODE, INODE),
+ X(BMBTD, BMBTD),
+ X(BMBTA, BMBTA),
+ X(BMBTC, BMBTC),
+ X(DIR, DIR),
+ X(XATTR, XATTR),
+ X(SYMLINK, SYMLINK),
+ X(PARENT, PARENT),
+ X(DIRTREE, DIRTREE),
+ {0, 0},
+ };
+#undef X
+ const struct u32_scrub *f;
+
+ foreach_scrub_type(f, hme->e.inode.mask, INODE_STRUCTURES) {
+ enum repair_outcome outcome =
+ xfs_repair_metadata(mnt_fd, f->scrub_type,
+ 0, hme->e.inode.ino, hme->e.inode.gen);
+
+ pthread_mutex_lock(&ctx->conlock);
+ report_health_repair(pfx, hme, f->event_mask, outcome);
+ pthread_mutex_unlock(&ctx->conlock);
+ }
+}
+
+/*
+ * Repair the metadata named by a health event.  Reconnects to the filesystem
+ * through the weak handle, hands the event to the repair routine for its
+ * domain, and closes the fd again.  Events from any other domain are ignored.
+ *
+ * Returns 0 once the filesystem has been reopened, or the nonzero return
+ * value of weakhandle_reopen() (after logging to stderr) if it has not.
+ */
+int
+repair_metadata(
+	struct healer_ctx			*ctx,
+	const struct hme_prefix			*pfx,
+	const struct xfs_health_monitor_event	*hme)
+{
+	int					fd;
+	int					error;
+
+	error = weakhandle_reopen(ctx->wh, &fd);
+	if (error) {
+		fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+				_("cannot open filesystem to repair"),
+				strerror(errno));
+		return error;
+	}
+
+	if (hme->domain == XFS_HEALTH_MONITOR_DOMAIN_FS)
+		try_repair_wholefs(ctx, pfx, fd, hme);
+	else if (hme->domain == XFS_HEALTH_MONITOR_DOMAIN_AG)
+		try_repair_ag(ctx, pfx, fd, hme);
+	else if (hme->domain == XFS_HEALTH_MONITOR_DOMAIN_RTGROUP)
+		try_repair_rtgroup(ctx, pfx, fd, hme);
+	else if (hme->domain == XFS_HEALTH_MONITOR_DOMAIN_INODE)
+		try_repair_inode(ctx, pfx, fd, hme);
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Ask the kernel if it supports repairs by issuing a repair-mode PROBE scrub
+ * against the mount fd.  Any ioctl error is taken to mean "not supported".
+ */
+bool
+healer_can_repair(
+	struct healer_ctx		*ctx)
+{
+	struct xfs_scrub_metadata	probe = {
+		.sm_type		= XFS_SCRUB_TYPE_PROBE,
+		.sm_flags		= XFS_SCRUB_IFLAG_REPAIR,
+	};
+
+	return ioctl(ctx->mnt.fd, XFS_IOC_SCRUB_METADATA, &probe) == 0;
+}
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
new file mode 100644
index 00000000000000..53df43b03e16cc
--- /dev/null
+++ b/healer/weakhandle.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025-2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "platform_defs.h"
+#include "handle.h"
+#include "libfrog/fsgeom.h"
+#include "libfrog/workqueue.h"
+#include "xfs_healer.h"
+
+struct weakhandle {
+ /* Shared reference to the user's mountpoint for logging */
+ const char *mntpoint;
+
+ /* Shared reference to the getmntent fsname for reconnecting */
+ const char *fsname;
+
+ /* handle to root dir, and its length, as sampled by fd_to_handle() */
+ void *hanp;
+ size_t hlen;
+};
+
+/*
+ * Capture a handle for the filesystem open at @fd without hanging on to the
+ * fd itself.  @mountpoint and @fsname are borrowed, not copied.
+ *
+ * Returns 0 and stores the new object in @whp on success, or -1 with @whp set
+ * to NULL on failure.  errno is EINVAL for a bad fd or a missing mountpoint.
+ */
+int
+weakhandle_alloc(
+	int			fd,
+	const char		*mountpoint,
+	const char		*fsname,
+	struct weakhandle	**whp)
+{
+	struct weakhandle	*new_wh;
+
+	*whp = NULL;
+
+	if (fd < 0 || mountpoint == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	new_wh = calloc(1, sizeof(*new_wh));
+	if (new_wh == NULL)
+		return -1;
+
+	new_wh->mntpoint = mountpoint;
+	new_wh->fsname = fsname;
+
+	if (fd_to_handle(fd, &new_wh->hanp, &new_wh->hlen) != 0) {
+		free(new_wh);
+		return -1;
+	}
+
+	*whp = new_wh;
+	return 0;
+}
+
+/*
+ * Reopen the mountpoint recorded in a weak handle and check that it is still
+ * the same filesystem by comparing a freshly sampled handle with the one that
+ * was captured when the weak handle was allocated.
+ *
+ * Returns 0 and stores the open fd in @fd on success.  On failure, returns -1
+ * with @fd set to -1 and errno describing the problem; errno is ESTALE if the
+ * handles no longer match.
+ */
+int
+weakhandle_reopen(
+	struct weakhandle	*wh,
+	int			*fd)
+{
+	void			*hanp;
+	size_t			hlen;
+	int			mnt_fd;
+	int			saved_errno;
+	int			ret;
+
+	*fd = -1;
+
+	mnt_fd = open(wh->mntpoint, O_RDONLY);
+	if (mnt_fd < 0)
+		return -1;
+
+	ret = fd_to_handle(mnt_fd, &hanp, &hlen);
+	if (ret) {
+		saved_errno = errno;
+		goto out_mntfd;
+	}
+
+	if (hlen != wh->hlen || memcmp(hanp, wh->hanp, hlen)) {
+		saved_errno = ESTALE;
+		goto out_handle;
+	}
+
+	free_handle(hanp, hlen);
+	*fd = mnt_fd;
+	return 0;
+
+out_handle:
+	free_handle(hanp, hlen);
+out_mntfd:
+	close(mnt_fd);
+	/*
+	 * Callers print strerror(errno), so do not let the cleanup calls above
+	 * replace the reason for the failure.
+	 */
+	errno = saved_errno;
+	return -1;
+}
+
+/*
+ * Tear down a weak handle.  Releases the stored file handle and the object
+ * itself if *whp is not NULL, and always resets *whp to NULL.  The mountpoint
+ * and fsname strings are not freed here.
+ */
+void
+weakhandle_free(
+ struct weakhandle **whp)
+{
+ struct weakhandle *wh = *whp;
+
+ if (wh) {
+ free_handle(wh->hanp, wh->hlen);
+ free(wh);
+ }
+
+ *whp = NULL;
+}
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index e0076fff381632..488f2a5310d0fd 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -59,6 +59,18 @@ event_loggable(
return ctx->log || event_not_actionable(hme);
}
+/*
+ * Are we going to try a repair?  Only if the event is actionable, repairs
+ * were requested, and the event reports sick metadata.
+ */
+static inline bool
+event_repairable(
+	const struct healer_ctx			*ctx,
+	const struct xfs_health_monitor_event	*hme)
+{
+	if (event_not_actionable(hme))
+		return false;
+	if (!ctx->want_repair)
+		return false;
+
+	return hme->type == XFS_HEALTH_MONITOR_TYPE_SICK;
+}
+
/* Handle an event asynchronously. */
static void
handle_event(
@@ -70,6 +82,7 @@ handle_event(
struct xfs_health_monitor_event *hme = arg;
struct healer_ctx *ctx = wq->wq_ctx;
const bool loggable = event_loggable(ctx, hme);
+ const bool will_repair = event_repairable(ctx, hme);
hme_prefix_init(&pfx, ctx->mntpoint);
@@ -83,6 +96,10 @@ handle_event(
pthread_mutex_unlock(&ctx->conlock);
}
+ /* Initiate a repair if appropriate. */
+ if (will_repair)
+ repair_metadata(ctx, &pfx, hme);
+
free(hme);
}
@@ -156,6 +173,40 @@ setup_monitor(
goto out_mnt_fd;
}
+ if (ctx->want_repair) {
+ /* Check that the kernel supports repairs at all. */
+ if (!healer_can_repair(ctx)) {
+ fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+ _("XFS online repair is not supported, exiting"));
+ goto out_mnt_fd;
+ }
+
+ /* Check for backref metadata that makes repair effective. */
+ if (!healer_has_rmapbt(ctx))
+ fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+ _("XFS online repair is less effective without rmap btrees."));
+
+ if (!healer_has_parent(ctx))
+ fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+ _("XFS online repair is less effective without parent pointers."));
+
+ }
+
+ /*
+ * Open weak-referenced file handle to mountpoint so that we can
+ * reconnect to the mountpoint to start repairs.
+ */
+ if (ctx->want_repair) {
+ ret = weakhandle_alloc(ctx->mnt.fd, ctx->mntpoint,
+ ctx->fsname, &ctx->wh);
+ if (ret) {
+ fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+ _("creating weak fshandle"),
+ strerror(errno));
+ goto out_mnt_fd;
+ }
+ }
+
/*
* Open the health monitor, then close the mountpoint to avoid pinning
* it. We can reconnect later if need be.
@@ -287,6 +338,7 @@ teardown_monitor(
ctx->mon_fp = NULL;
}
free(ctx->mon_buf);
+ weakhandle_free(&ctx->wh);
ctx->mon_buf = NULL;
}
@@ -301,6 +353,7 @@ usage(void)
fprintf(stderr, _(" --everything Capture all events.\n"));
fprintf(stderr, _(" --foreground Process events as soon as possible.\n"));
fprintf(stderr, _(" --quiet Do not log health events to stdout.\n"));
+ fprintf(stderr, _(" --repair Always repair corrupt metadata.\n"));
fprintf(stderr, _(" -V Print version.\n"));
exit(EXIT_FAILURE);
@@ -312,6 +365,7 @@ enum long_opt_nr {
LOPT_FOREGROUND,
LOPT_HELP,
LOPT_QUIET,
+ LOPT_REPAIR,
LOPT_MAX,
};
@@ -342,6 +396,7 @@ main(
[LOPT_FOREGROUND] = {"foreground", no_argument, &ctx.foreground, 1 },
[LOPT_HELP] = {"help", no_argument, NULL, 0 },
[LOPT_QUIET] = {"quiet", no_argument, &ctx.log, 0 },
+ [LOPT_REPAIR] = {"repair", no_argument, &ctx.want_repair, 1 },
[LOPT_MAX] = {NULL, 0, NULL, 0 },
};
diff --git a/libfrog/flagmap.c b/libfrog/flagmap.c
index 631c4bbc8f1dc0..ce413297780a2a 100644
--- a/libfrog/flagmap.c
+++ b/libfrog/flagmap.c
@@ -44,6 +44,23 @@ mask_to_string(
snprintf(buf, bufsize, "%s0x%llx", tag, mask & ~seen);
}
+/*
+ * Given a mapping of bits to strings and a bitmask, return the translated
+ * string of the first map entry whose flag intersects the mask.  The map is
+ * walked in order up to the entry with a NULL string, so the result is the
+ * lowest set bit only if the map lists its flags in ascending order.  If no
+ * entry matches, a translated "unknown flag" placeholder is returned.
+ */
+const char *
+lowest_set_mask_string(
+ const struct flag_map *map,
+ unsigned long long mask)
+{
+ for (; map->string; map++) {
+ if (mask & map->flag)
+ return _(map->string);
+ }
+
+ return _("unknown flag");
+}
+
/*
* Given a mapping of values to strings and a value, return the matching string
* or confusion.
diff --git a/libfrog/healthevent.c b/libfrog/healthevent.c
index 8520cb3218fb03..193738332dbd71 100644
--- a/libfrog/healthevent.c
+++ b/libfrog/healthevent.c
@@ -358,3 +358,120 @@ hme_report_event(
break;
}
}
+
+/*
+ * Translate a repair outcome into a message for the log.  Never returns NULL,
+ * because every caller passes the result straight to a printf "%s"
+ * conversion.
+ */
+static const char *
+repair_outcome_string(
+	enum repair_outcome	o)
+{
+	/* No default case, so the compiler can flag a missing enum value. */
+	switch (o) {
+	case REPAIR_FAILED:
+		return _("Repair unsuccessful; offline repair required.");
+	case REPAIR_PROBABLY_OK:
+		return _("Seems correct but cross-referencing failed; offline repair recommended.");
+	case REPAIR_UNNECESSARY:
+		return _("No modification needed.");
+	case REPAIR_SUCCESS:
+		return _("Repairs successful.");
+	}
+
+	/* Only reached if the value is not a valid enum repair_outcome. */
+	return _("Unknown repair outcome.");
+}
+
+/*
+ * Report inode metadata repair.  The file is identified by its path if the
+ * prefix has one, and otherwise by the mountpoint plus the inode number and
+ * generation from the event.
+ */
+static void
+report_inode_repair(
+	const struct hme_prefix			*pfx,
+	const struct xfs_health_monitor_event	*hme,
+	uint32_t				domain_mask,
+	enum repair_outcome			outcome)
+{
+	const char	*what = lowest_set_mask_string(inode_structs,
+						       domain_mask);
+	const char	*result = repair_outcome_string(outcome);
+
+	if (!hme_prefix_has_path(pfx)) {
+		printf("%s %s %llu %s 0x%x %s: %s\n", pfx->mountpoint,
+				_("ino"),
+				(unsigned long long)hme->e.inode.ino,
+				_("gen"),
+				hme->e.inode.gen,
+				what, result);
+	} else {
+		printf("%s %s: %s\n", pfx->path, what, result);
+	}
+	fflush(stdout);
+}
+
+/*
+ * Report AG metadata repair: print the mountpoint, the group number from the
+ * event, the name of the repaired structure and the outcome, then flush
+ * stdout.
+ */
+static void
+report_ag_repair(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme,
+ uint32_t domain_mask,
+ enum repair_outcome outcome)
+{
+ printf("%s %s 0x%x %s: %s\n", pfx->mountpoint,
+ _("agno"),
+ hme->e.group.gno,
+ lowest_set_mask_string(ag_structs, domain_mask),
+ repair_outcome_string(outcome));
+ fflush(stdout);
+}
+
+/*
+ * Report rtgroup metadata repair: print the mountpoint, the group number from
+ * the event, the name of the repaired structure and the outcome, then flush
+ * stdout.
+ */
+static void
+report_rtgroup_repair(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme,
+ uint32_t domain_mask,
+ enum repair_outcome outcome)
+{
+ printf("%s %s 0x%x %s: %s\n", pfx->mountpoint,
+ _("rgno"),
+ hme->e.group.gno,
+ lowest_set_mask_string(rtgroup_structs, domain_mask),
+ repair_outcome_string(outcome));
+ fflush(stdout);
+}
+
+/*
+ * Report fs-wide metadata repair: print the mountpoint, the name of the
+ * repaired structure and the outcome, then flush stdout.  Nothing from @hme
+ * is printed.
+ */
+static void
+report_fs_repair(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme,
+ uint32_t domain_mask,
+ enum repair_outcome outcome)
+{
+ printf("%s %s: %s\n", pfx->mountpoint,
+ lowest_set_mask_string(fs_structs, domain_mask),
+ repair_outcome_string(outcome));
+ fflush(stdout);
+}
+
+/*
+ * Log a repair event to stdout.  Dispatches on hme->domain to the reporter
+ * for inode, AG, rtgroup or fs-wide metadata; an event from any other domain
+ * produces no output.  @domain_mask selects which structure name is printed.
+ */
+void
+report_health_repair(
+ const struct hme_prefix *pfx,
+ const struct xfs_health_monitor_event *hme,
+ uint32_t domain_mask,
+ enum repair_outcome outcome)
+{
+ switch (hme->domain) {
+ case XFS_HEALTH_MONITOR_DOMAIN_INODE:
+ report_inode_repair(pfx, hme, domain_mask, outcome);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_AG:
+ report_ag_repair(pfx, hme, domain_mask, outcome);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_RTGROUP:
+ report_rtgroup_repair(pfx, hme, domain_mask, outcome);
+ break;
+ case XFS_HEALTH_MONITOR_DOMAIN_FS:
+ report_fs_repair(pfx, hme, domain_mask, outcome);
+ break;
+ default:
+ break;
+ }
+}
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 12/26] xfs_healer: use getparents to look up file names
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (10 preceding siblings ...)
2026-03-19 4:41 ` [PATCH 11/26] xfs_healer: enable repairing filesystems Darrick J. Wong
@ 2026-03-19 4:41 ` Darrick J. Wong
2026-03-19 4:42 ` [PATCH 13/26] xfs_healer: create a per-mount background monitoring service Darrick J. Wong
` (13 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:41 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
If the kernel tells us about something that happened to a file, use the
GETPARENTS ioctl to try to look up the path to that file for more
ergonomic reporting.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 6 ++++
healer/fsrepair.c | 16 ++++++++-
healer/weakhandle.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++
healer/xfs_healer.c | 45 ++++++++++++++++++++++++++-
4 files changed, 149 insertions(+), 4 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index a4de1ad32a408f..6d12921245934c 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -61,6 +61,10 @@ static inline bool healer_has_parent(const struct healer_ctx *ctx)
return ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_PARENT;
}
+void lookup_path(struct healer_ctx *ctx,
+ const struct xfs_health_monitor_event *hme,
+ struct hme_prefix *pfx);
+
/* repair.c */
int repair_metadata(struct healer_ctx *ctx, const struct hme_prefix *pfx,
const struct xfs_health_monitor_event *hme);
@@ -71,5 +75,7 @@ int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname,
struct weakhandle **whp);
int weakhandle_reopen(struct weakhandle *wh, int *fd);
void weakhandle_free(struct weakhandle **whp);
+int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen,
+ char *path, size_t pathlen);
#endif /* XFS_HEALER_XFS_HEALER_H_ */
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 907afca3dba8a7..4534104f8a6ac1 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -164,7 +164,7 @@ try_repair_rtgroup(
static void
try_repair_inode(
struct healer_ctx *ctx,
- const struct hme_prefix *pfx,
+ const struct hme_prefix *orig_pfx,
int mnt_fd,
const struct xfs_health_monitor_event *hme)
{
@@ -182,13 +182,25 @@ try_repair_inode(
{0, 0},
};
#undef X
- const struct u32_scrub *f;
+ struct hme_prefix new_pfx;
+ const struct hme_prefix *pfx = orig_pfx;
+ const struct u32_scrub *f;
foreach_scrub_type(f, hme->e.inode.mask, INODE_STRUCTURES) {
enum repair_outcome outcome =
xfs_repair_metadata(mnt_fd, f->scrub_type,
0, hme->e.inode.ino, hme->e.inode.gen);
+ /*
+ * Try again to find the file path, maybe we fixed the dir
+ * tree.
+ */
+ if (!hme_prefix_has_path(pfx)) {
+ lookup_path(ctx, hme, &new_pfx);
+ if (hme_prefix_has_path(&new_pfx))
+ pfx = &new_pfx;
+ }
+
pthread_mutex_lock(&ctx->conlock);
report_health_repair(pfx, hme, f->event_mask, outcome);
pthread_mutex_unlock(&ctx->conlock);
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 53df43b03e16cc..8950e0eb1e5a43 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -11,6 +11,8 @@
#include "handle.h"
#include "libfrog/fsgeom.h"
#include "libfrog/workqueue.h"
+#include "libfrog/getparents.h"
+#include "libfrog/paths.h"
#include "xfs_healer.h"
struct weakhandle {
@@ -113,3 +115,87 @@ weakhandle_free(
*whp = NULL;
}
+
+/* Output buffer handed to render_path through the path walk callback argument. */
+struct bufvec {
+ /* where to write the rendered path */
+ char *buf;
+ /* size of buf in bytes */
+ size_t len;
+};
+
+/*
+ * Path walk callback: write the mountpoint (without trailing slashes)
+ * followed by the string form of @path into the caller's buffer.  Returns
+ * ECANCELED once a path has been rendered, or 0 if this one could not be
+ * (presumably letting the walk move on to the next path; confirm against
+ * handle_walk_paths_fd).
+ */
+static int
+render_path(
+	const char		*mntpt,
+	const struct path_list	*path,
+	void			*arg)
+{
+	struct bufvec		*bv = arg;
+	int			prefix_len = strlen(mntpt);
+	ssize_t			nr;
+
+	/* Trim trailing slashes from the mountpoint */
+	for (; prefix_len > 0; prefix_len--) {
+		if (mntpt[prefix_len - 1] != '/')
+			break;
+	}
+
+	nr = snprintf(bv->buf, bv->len, "%.*s", prefix_len, mntpt);
+	if (nr < 0 || (size_t)nr >= bv->len)
+		return 0;
+
+	nr = path_list_to_string(path, bv->buf + nr, bv->len - nr);
+	if (nr < 0)
+		return 0;
+
+	/* magic code that means we found one */
+	return ECANCELED;
+}
+
+/*
+ * Render any path to the file with the given inode number and generation
+ * into the specified buffer.  The file handle is built from the handle stored
+ * in the weak handle, with the inode number and generation swapped in.
+ *
+ * Returns 0 if a path was rendered.  Returns -1 with errno set otherwise:
+ * EINVAL if the stored handle is not the size of a struct xfs_handle, the
+ * weakhandle_reopen() error if the filesystem could not be reopened, or
+ * ENOENT if the walk did not produce a path.
+ */
+int
+weakhandle_getpath_for(
+	struct weakhandle	*wh,
+	uint64_t		ino,
+	uint32_t		gen,
+	char			*path,
+	size_t			pathlen)
+{
+	struct xfs_handle	fakehandle;
+	struct bufvec		bv = {
+		.buf		= path,
+		.len		= pathlen,
+	};
+	int			mnt_fd;
+	int			ret;
+
+	if (wh->hlen != sizeof(fakehandle)) {
+		errno = EINVAL;
+		return -1;
+	}
+	memcpy(&fakehandle, wh->hanp, sizeof(fakehandle));
+	fakehandle.ha_fid.fid_ino = ino;
+	fakehandle.ha_fid.fid_gen = gen;
+
+	ret = weakhandle_reopen(wh, &mnt_fd);
+	if (ret)
+		return ret;
+
+	/*
+	 * In the common case, files only have one parent; and what's the
+	 * chance that we'll need to walk past the second parent to find *one*
+	 * path that goes to the rootdir?  With a max filename length of 255
+	 * bytes, we pick 600 for the buffer size.
+	 */
+	ret = handle_walk_paths_fd(wh->mntpoint, mnt_fd, &fakehandle,
+			sizeof(fakehandle), 600, render_path, &bv);
+	close(mnt_fd);
+
+	/* render_path returns ECANCELED as soon as it has rendered a path */
+	if (ret == ECANCELED)
+		return 0;
+
+	/* didn't find one; set errno after close() so it cannot be overwritten */
+	errno = ENOENT;
+	return -1;
+}
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 488f2a5310d0fd..63baf641cb6ec6 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -34,6 +34,39 @@ open_health_monitor(
return ioctl(mnt_fd, XFS_IOC_HEALTH_MONITOR, &hmo);
}
+/*
+ * Try to find a path for the file named in an inode or file range event and
+ * store it in @pfx.  If the filesystem has no parent pointers, the event is
+ * not about a file, or the lookup fails, the path in @pfx is cleared instead.
+ * Either way the caller may test the result with hme_prefix_has_path(), even
+ * if it passed in a prefix whose path was never initialized.
+ */
+void
+lookup_path(
+	struct healer_ctx			*ctx,
+	const struct xfs_health_monitor_event	*hme,
+	struct hme_prefix			*pfx)
+{
+	uint64_t				ino = 0;
+	uint32_t				gen = 0;
+
+	if (!healer_has_parent(ctx))
+		goto no_path;
+
+	switch (hme->domain) {
+	case XFS_HEALTH_MONITOR_DOMAIN_INODE:
+		ino = hme->e.inode.ino;
+		gen = hme->e.inode.gen;
+		break;
+	case XFS_HEALTH_MONITOR_DOMAIN_FILERANGE:
+		ino = hme->e.filerange.ino;
+		gen = hme->e.filerange.gen;
+		break;
+	default:
+		goto no_path;
+	}
+
+	if (weakhandle_getpath_for(ctx->wh, ino, gen, pfx->path,
+			sizeof(pfx->path)) == 0)
+		return;
+
+no_path:
+	/* Never leave stale or uninitialized bytes in the path buffer. */
+	hme_prefix_clear_path(pfx);
+}
+
/* Decide if this event can only be reported upon, and not acted upon. */
static bool
event_not_actionable(
@@ -86,6 +119,13 @@ handle_event(
hme_prefix_init(&pfx, ctx->mntpoint);
+ /*
+ * Try to look up the file name for the file we're about to log or
+ * about to repair (which always logs).
+ */
+ if (loggable || will_repair)
+ lookup_path(ctx, hme, &pfx);
+
/*
* Non-actionable events should always be logged, because they are 100%
* informational.
@@ -194,9 +234,10 @@ setup_monitor(
/*
* Open weak-referenced file handle to mountpoint so that we can
- * reconnect to the mountpoint to start repairs.
+ * reconnect to the mountpoint to start repairs or to look up file
+ * paths for logging.
*/
- if (ctx->want_repair) {
+ if (ctx->want_repair || healer_has_parent(ctx)) {
ret = weakhandle_alloc(ctx->mnt.fd, ctx->mntpoint,
ctx->fsname, &ctx->wh);
if (ret) {
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 13/26] xfs_healer: create a per-mount background monitoring service
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (11 preceding siblings ...)
2026-03-19 4:41 ` [PATCH 12/26] xfs_healer: use getparents to look up file names Darrick J. Wong
@ 2026-03-19 4:42 ` Darrick J. Wong
2026-03-19 4:42 ` [PATCH 14/26] xfs_healer: create a service to start the per-mount healer service Darrick J. Wong
` (12 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:42 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a systemd service definition for our self-healing filesystem
daemon so that we can run it for every mounted filesystem. Add a
hidden switch so that we can print the service unit name for fstests.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 1
healer/Makefile | 22 ++++++++
healer/system-xfs_healer.slice | 31 ++++++++++++
healer/xfs_healer.c | 16 ++++++
healer/xfs_healer@.service.in | 107 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 176 insertions(+), 1 deletion(-)
create mode 100644 healer/system-xfs_healer.slice
create mode 100644 healer/xfs_healer@.service.in
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index 6d12921245934c..679bdc95ae48f8 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -26,6 +26,7 @@ struct healer_ctx {
int everything;
int foreground;
int want_repair;
+ int print_svcname;
/* fd and fs geometry for mount */
struct xfs_fd mnt;
diff --git a/healer/Makefile b/healer/Makefile
index 981192b81af626..ee44aaee461250 100644
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -22,7 +22,23 @@ LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
LLDFLAGS = -static
-default: depend $(LTCOMMAND)
+XFS_HEALER_SVCNAME=xfs_healer@.service
+CFLAGS += -DXFS_HEALER_SVCNAME=\"$(XFS_HEALER_SVCNAME)\"
+
+ifeq ($(HAVE_SYSTEMD),yes)
+INSTALL_HEALER += install-systemd
+SYSTEMD_SERVICES=\
+ system-xfs_healer.slice \
+ $(XFS_HEALER_SVCNAME)
+OPTIONAL_TARGETS += $(SYSTEMD_SERVICES)
+endif
+
+default: depend $(LTCOMMAND) $(SYSTEMD_SERVICES)
+
+%.service: %.service.in $(builddefs)
+ @echo " [SED] $@"
+ $(Q)$(SED) -e "s|@pkg_libexec_dir@|$(PKG_LIBEXEC_DIR)|g" \
+ < $< > $@
include $(BUILDRULES)
@@ -32,6 +48,10 @@ install-healer: default
$(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
$(INSTALL) -m 755 $(LTCOMMAND) $(PKG_LIBEXEC_DIR)
+install-systemd: default
+ $(INSTALL) -m 755 -d $(SYSTEMD_SYSTEM_UNIT_DIR)
+ $(INSTALL) -m 644 $(SYSTEMD_SERVICES) $(SYSTEMD_SYSTEM_UNIT_DIR)
+
install-dev:
-include .dep
diff --git a/healer/system-xfs_healer.slice b/healer/system-xfs_healer.slice
new file mode 100644
index 00000000000000..b8f5bca03963ff
--- /dev/null
+++ b/healer/system-xfs_healer.slice
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=xfs_healer background service slice
+Before=slices.target
+
+[Slice]
+
+# If the CPU usage cgroup controller is available, don't use more than 2 cores
+# for all background processes. One thread to read events, another to run
+# repairs.
+CPUQuota=200%
+CPUAccounting=true
+
+[Install]
+# As of systemd 249, the systemd cgroupv2 configuration code will drop resource
+# controllers from the root and system.slice cgroups at startup if it doesn't
+# find any direct dependencies that require a given controller. Newly
+# activated units with resource control directives are created under the system
+# slice but do not cause a reconfiguration of the slice's resource controllers.
+# Hence we cannot put CPUQuota= into the xfs_healer service units directly.
+#
+# For the CPUQuota directive to have any effect, we must therefore create an
+# explicit definition file for the slice that systemd creates to contain the
+# xfs_healer instance units (e.g. xfs_healer@.service) and we must configure
+# this slice as a dependency of the system slice to establish the direct
+# dependency relation.
+WantedBy=system.slice
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 63baf641cb6ec6..1a26ffe830e5fe 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -407,6 +407,7 @@ enum long_opt_nr {
LOPT_HELP,
LOPT_QUIET,
LOPT_REPAIR,
+ LOPT_SVCNAME,
LOPT_MAX,
};
@@ -438,6 +439,7 @@ main(
[LOPT_HELP] = {"help", no_argument, NULL, 0 },
[LOPT_QUIET] = {"quiet", no_argument, &ctx.log, 0 },
[LOPT_REPAIR] = {"repair", no_argument, &ctx.want_repair, 1 },
+ [LOPT_SVCNAME] = {"svcname", no_argument, &ctx.print_svcname, 1 },
[LOPT_MAX] = {NULL, 0, NULL, 0 },
};
@@ -474,6 +476,20 @@ main(
ctx.mntpoint = argv[optind];
+ if (ctx.print_svcname) {
+ char unitname[PATH_MAX];
+
+ ret = systemd_path_instance_unit_name(XFS_HEALER_SVCNAME,
+ ctx.mntpoint, unitname, sizeof(unitname));
+ if (ret) {
+ perror(ctx.mntpoint);
+ return EXIT_FAILURE;
+ }
+
+ printf("%s\n", unitname);
+ return EXIT_SUCCESS;
+ }
+
ret = setup_monitor(&ctx);
if (ret)
goto out_events;
diff --git a/healer/xfs_healer@.service.in b/healer/xfs_healer@.service.in
new file mode 100644
index 00000000000000..385257872b0cbb
--- /dev/null
+++ b/healer/xfs_healer@.service.in
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (c) 2024-2026 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Self Healing of XFS Metadata for %f
+
+# Explicitly require the capabilities that this program needs
+ConditionCapability=CAP_SYS_ADMIN
+ConditionCapability=CAP_DAC_OVERRIDE
+
+# Must be a mountpoint
+ConditionPathIsMountPoint=%f
+RequiresMountsFor=%f
+
+[Service]
+Type=exec
+Environment=SERVICE_MODE=1
+ExecStart=@pkg_libexec_dir@/xfs_healer %f
+SyslogIdentifier=%N
+
+# Create the service underneath the healer background service slice so that we
+# can control resource usage.
+Slice=system-xfs_healer.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# xfs_healer avoids pinning mounted filesystems by recording the file handle
+# for the provided mountpoint (%f) before opening the health monitor, after
+# which it closes the fd for the mountpoint. If repairs are needed, it will
+# reopen the mountpoint, resample the file handle, and proceed only if the
+# handles match. If the filesystem is unmounted, the daemon exits. If the
+# mountpoint moves, repairs will not be attempted against the wrong filesystem.
+#
+# Due to this resampling behavior, xfs_healer must see the same filesystem
+# mount tree inside the service container as outside, with the same ro/rw
+# state. BindPaths doesn't work on the paths that are made readonly by
+# ProtectSystem and ProtectHome, so it is not possible to set either option.
+# DynamicUser sets ProtectSystem, so that also cannot be used. We cannot use
+# BindPaths to bind the desired mountpoint somewhere under /tmp like xfs_scrub
+# does because that pins the mount.
+#
+# Regrettably, this leaves xfs_healer less hardened than xfs_scrub.
+# Surprisingly, this doesn't affect xfs_healer's score dramatically.
+DynamicUser=false
+ProtectSystem=false
+ProtectHome=no
+PrivateTmp=true
+PrivateDevices=true
+
+# Don't let healer complain about paths in /etc/projects that have been hidden
+# by our sandboxing. healer doesn't care about project ids anyway.
+InaccessiblePaths=-/etc/projects
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# xfs_healer needs these privileges to open the rootdir and monitor
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+AmbientCapabilities=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+NoNewPrivileges=true
+
+# xfs_healer doesn't create files
+UMask=7777
+
+# No access to hardware /dev files except for block devices
+ProtectClock=true
+DevicePolicy=closed
+
+[Install]
+WantedBy=multi-user.target
+# If someone tries to enable the template itself, translate that into enabling
+# this service on the root directory at systemd startup time. In the
+# initramfs, the udev rules in xfs_healer.rules run before systemd starts.
+DefaultInstance=-
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 14/26] xfs_healer: create a service to start the per-mount healer service
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (12 preceding siblings ...)
2026-03-19 4:42 ` [PATCH 13/26] xfs_healer: create a per-mount background monitoring service Darrick J. Wong
@ 2026-03-19 4:42 ` Darrick J. Wong
2026-03-19 4:42 ` [PATCH 15/26] xfs_healer: don't start service if kernel support unavailable Darrick J. Wong
` (11 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:42 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a daemon to wait for xfs mount events via fsnotify and start up
the per-mount healer service. It's important that we're running in the
same mount namespace as the mount, so we're a fanotify client to avoid
having to filter the mount namespaces ourselves.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
libfrog/systemd.h | 23 +-
configure.ac | 1
healer/Makefile | 16 +-
healer/xfs_healer_start.c | 368 ++++++++++++++++++++++++++++++++++++
healer/xfs_healer_start.service.in | 85 ++++++++
include/builddefs.in | 5
m4/package_libcdev.m4 | 24 ++
7 files changed, 511 insertions(+), 11 deletions(-)
create mode 100644 healer/xfs_healer_start.c
create mode 100644 healer/xfs_healer_start.service.in
diff --git a/libfrog/systemd.h b/libfrog/systemd.h
index c96df4afa39aa6..8a0970282d1080 100644
--- a/libfrog/systemd.h
+++ b/libfrog/systemd.h
@@ -22,6 +22,20 @@ static inline bool systemd_is_service(void)
return getenv("SERVICE_MODE") != NULL;
}
+/* Compute the exit code for a service program that is exiting right now. */
+static inline int
+systemd_service_exit_now(int ret)
+{
+	/*
+	 * Service exit codes must follow the LSB init script action error
+	 * guidelines: squash every failure to 1 ("generic or unspecified
+	 * error", LSB 5.0 section 22.2) and let the admin scan the log.
+	 */
+	if (ret)
+		return EXIT_FAILURE;
+	return EXIT_SUCCESS;
+}
+
/* Special processing for a service/daemon program that is exiting. */
static inline int
systemd_service_exit(int ret)
@@ -35,14 +49,7 @@ systemd_service_exit(int ret)
*/
sleep(2);
- /*
- * If we're being run as a service, the return code must fit the LSB
- * init script action error guidelines, which is to say that we
- * compress all errors to 1 ("generic or unspecified error", LSB 5.0
- * section 22.2) and hope the admin will scan the log for what actually
- * happened.
- */
- return ret != 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+ return systemd_service_exit_now(ret);
}
#endif /* __LIBFROG_SYSTEMD_H__ */
diff --git a/configure.ac b/configure.ac
index 90af1f84035ee6..e098cf0530415b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -194,6 +194,7 @@ if test "$have_listmount" = "yes"; then
AC_HAVE_LISTMOUNT_NS_FD
AC_HAVE_STATMOUNT_SUPPORTED_MASK
fi
+AC_HAVE_FANOTIFY_MOUNTINFO
if test "$enable_ubsan" = "yes" || test "$enable_ubsan" = "probe"; then
AC_PACKAGE_CHECK_UBSAN
diff --git a/healer/Makefile b/healer/Makefile
index ee44aaee461250..1eeb727682008b 100644
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -9,6 +9,7 @@ include $(builddefs)
INSTALL_HEALER = install-healer
LTCOMMAND = xfs_healer
+BUILD_TARGETS = $(LTCOMMAND)
CFILES = \
fsrepair.c \
@@ -31,9 +32,18 @@ SYSTEMD_SERVICES=\
system-xfs_healer.slice \
$(XFS_HEALER_SVCNAME)
OPTIONAL_TARGETS += $(SYSTEMD_SERVICES)
-endif
+endif # HAVE_SYSTEMD
-default: depend $(LTCOMMAND) $(SYSTEMD_SERVICES)
+ifeq ($(HAVE_HEALER_START_DEPS),yes)
+BUILD_TARGETS += xfs_healer_start
+SYSTEMD_SERVICES += xfs_healer_start.service
+endif # xfs_healer_start deps
+
+default: depend $(BUILD_TARGETS) $(SYSTEMD_SERVICES)
+
+xfs_healer_start: $(SUBDIRS) xfs_healer_start.o $(LTDEPENDENCIES)
+ @echo " [LD] $@"
+ $(Q)$(LTLINK) -o $@ $(LDFLAGS) xfs_healer_start.o $(LDLIBS)
%.service: %.service.in $(builddefs)
@echo " [SED] $@"
@@ -46,7 +56,7 @@ install: $(INSTALL_HEALER)
install-healer: default
$(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
- $(INSTALL) -m 755 $(LTCOMMAND) $(PKG_LIBEXEC_DIR)
+ $(INSTALL) -m 755 $(BUILD_TARGETS) $(PKG_LIBEXEC_DIR)
install-systemd: default
$(INSTALL) -m 755 -d $(SYSTEMD_SYSTEM_UNIT_DIR)
diff --git a/healer/xfs_healer_start.c b/healer/xfs_healer_start.c
new file mode 100644
index 00000000000000..c016e915da79a4
--- /dev/null
+++ b/healer/xfs_healer_start.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include <errno.h>
+#include <err.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/fanotify.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <limits.h>
+
+#include "platform_defs.h"
+#include "libfrog/systemd.h"
+#include "libfrog/statmount.h"
+
+static int debug = 0;
+static const char *progname = "xfs_healer_start";
+
+/* Ask systemd to (re)start the xfs_healer instance for one mountpoint. */
+static void
+start_healer(
+	const char		*mntpoint)
+{
+	char			svc[PATH_MAX];
+
+	/* Work out which service unit belongs to this mountpoint. */
+	if (systemd_path_instance_unit_name(XFS_HEALER_SVCNAME, mntpoint,
+				svc, sizeof(svc))) {
+		fprintf(stderr, "%s: %s\n", mntpoint,
+				_("Could not determine xfs_healer unit name."));
+		return;
+	}
+
+	/*
+	 * Restart rather than start, so that an existing unit that is still
+	 * slowly working its way off a cycled mount cannot foil us.
+	 */
+	if (systemd_manage_unit(UM_RESTART, svc)) {
+		fprintf(stderr, "%s: %s: %s\n", mntpoint,
+				_("Could not start xfs_healer service unit"),
+				svc);
+		return;
+	}
+
+	/* Flush right away so that the message is not left in a buffer. */
+	printf("%s: %s\n", mntpoint,
+			_("xfs_healer service started."));
+	fflush(stdout);
+}
+
+#define REQUIRED_STATMOUNT_FIELDS (STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_MNT_ROOT)
+
+/* Look up one mount and start a healer if it is the root of an XFS fs. */
+static void
+examine_mount(
+	int			mnt_ns_fd,
+	uint64_t		mnt_id)
+{
+	size_t			bufsz = libfrog_statmount_sizeof(4096);
+	struct statmount	*sm = alloca(bufsz);
+	const char		*fstype = "null";
+	const char		*mntpt = "null";
+	const char		*mntroot = "null";
+
+	if (libfrog_statmount(mnt_id, mnt_ns_fd, REQUIRED_STATMOUNT_FIELDS,
+				sm, bufsz)) {
+		perror("statmount");
+		return;
+	}
+
+	/* Only use the string offsets whose mask bit is set in the reply. */
+	if (sm->mask & STATMOUNT_FS_TYPE)
+		fstype = sm->str + sm->fs_type;
+	if (sm->mask & STATMOUNT_MNT_POINT)
+		mntpt = sm->str + sm->mnt_point;
+	if (sm->mask & STATMOUNT_MNT_ROOT)
+		mntroot = sm->str + sm->mnt_root;
+
+	if (debug) {
+		printf("mount: id 0x%llx fstype %s mountpoint %s mntroot %s\n",
+			(unsigned long long)mnt_id, fstype, mntpt, mntroot);
+		fflush(stdout);
+	}
+
+	/* Look for mount points for the root dir of an XFS filesystem. */
+	if ((sm->mask & REQUIRED_STATMOUNT_FIELDS) != REQUIRED_STATMOUNT_FIELDS)
+		return;
+	if (strcmp(fstype, "xfs") == 0 && strcmp(mntroot, "/") == 0)
+		start_healer(mntpt);
+}
+
+/* Examine every mount named by one FAN_MNT_ATTACH fanotify event. */
+static void
+handle_mount_event(
+	const struct fanotify_event_metadata *event,
+	int mnt_ns_fd)
+{
+	const struct fanotify_event_info_header *info;
+	const struct fanotify_event_info_mnt *mnt;
+	size_t off = sizeof(*event);
+
+	if (event->fd != FAN_NOFD) {
+		if (debug)
+			fprintf(stderr, "Expected FAN_NOFD, got fd=%d\n",
+					event->fd);
+		return;
+	}
+
+	/* Anything but a lone attach event should never get here. */
+	if (event->mask != FAN_MNT_ATTACH)
+		return;
+	if (debug) {
+		printf("FAN_MNT_ATTACH (len=%u)\n", event->event_len);
+		fflush(stdout);
+	}
+
+	for (; off + sizeof(*info) <= event->event_len; off += info->len) {
+		info = (const struct fanotify_event_info_header *)
+			((const char *)event + off);
+		/* A bad length would stall this loop or overrun the event. */
+		if (info->len < sizeof(*info) ||
+		    info->len > event->event_len - off) {
+			if (debug)
+				fprintf(stderr,
+	"Malformed fanotify event info_type=%d len=%d\n",
+						info->info_type, info->len);
+			return;
+		}
+		/* Skip record types we don't know and truncated mount records. */
+		if (info->info_type != FAN_EVENT_INFO_TYPE_MNT ||
+		    info->len < sizeof(*mnt)) {
+			if (debug)
+				fprintf(stderr,
+	"Unexpected fanotify event info_type=%d len=%d\n",
+						info->info_type, info->len);
+			continue;
+		}
+
+		mnt = (const struct fanotify_event_info_mnt *)info;
+		if (debug) {
+			printf("Mount record: len=%d mnt_id=0x%llx\n",
+				mnt->hdr.len, (unsigned long long)mnt->mnt_id);
+			fflush(stdout);
+		}
+		examine_mount(mnt_ns_fd, mnt->mnt_id);
+	}
+}
+
+/* Hand each fanotify event in a freshly read buffer to its handler. */
+static void
+handle_notifications(
+	char			*buffer,
+	ssize_t			len,
+	int			mnt_ns_fd)
+{
+	struct fanotify_event_metadata	*ev;
+
+	/* Step through the buffer for as long as a whole event remains. */
+	ev = (struct fanotify_event_metadata *)buffer;
+	while (FAN_EVENT_OK(ev, len)) {
+		if (ev->mask == FAN_MNT_ATTACH) {
+			handle_mount_event(ev, mnt_ns_fd);
+		} else if (debug) {
+			/* not an event that we asked fanotify to report */
+			fprintf(stderr,
+	"Unexpected fanotify mark: 0x%llx\n",
+				(unsigned long long)ev->mask);
+		}
+
+		/* Move on to whatever follows this event in the buffer. */
+		ev = FAN_EVENT_NEXT(ev, len);
+	}
+}
+
+#define NR_MNT_IDS (32)
+
+/* Examine every mount that already exists in the mount namespace. */
+static int
+start_existing_mounts(
+	int			mnt_ns_fd)
+{
+	uint64_t		ids[NR_MNT_IDS];
+	uint64_t		cursor = LISTMOUNT_INIT_CURSOR;
+	int			nr, i;
+
+	/* Fetch mount ids one batch at a time until none are returned. */
+	for (;;) {
+		nr = libfrog_listmount(LSMT_ROOT, mnt_ns_fd, &cursor, ids,
+				NR_MNT_IDS);
+		if (nr <= 0)
+			break;
+		for (i = 0; i < nr; i++)
+			examine_mount(mnt_ns_fd, ids[i]);
+	}
+	if (nr == 0)
+		return 0;
+	if (errno != ENOSYS)
+		perror("listmount");
+	else
+		fprintf(stderr, "%s\n",
+	_("This program requires the listmount system call."));
+	return -1;
+}
+
+/* Print the command line help to stderr and exit with a failure code. */
+static void __attribute__((noreturn))
+usage(void)
+{
+ fprintf(stderr, "%s %s %s\n", _("Usage:"), progname, _("[OPTIONS]"));
+ fprintf(stderr, "\n");
+ fprintf(stderr, _("Options:\n"));
+ fprintf(stderr, _(" --debug Enable debugging messages.\n"));
+ fprintf(stderr, _(" --mountns Path to the mount namespace file.\n"));
+ fprintf(stderr, _(" --supported Make sure we can actually run.\n"));
+ fprintf(stderr, _(" -V Print version.\n"));
+ exit(EXIT_FAILURE);
+}
+
+enum long_opt_nr {	/* indices into the long_options[] table in main() */
+ LOPT_DEBUG,
+ LOPT_HELP,
+ LOPT_MOUNTNS,
+ LOPT_SUPPORTED,
+ /* index of the all-zeroes entry that terminates long_options[] */
+ LOPT_MAX,
+};
+
+/* Start xfs_healer for the XFS mounts in a mount namespace, now and later. */
+int
+main(
+	int			argc,
+	char			*argv[])
+{
+	/* Events are parsed in place, so the buffer must be aligned for them. */
+	char			buffer[BUFSIZ] __attribute__((aligned(
+				__alignof__(struct fanotify_event_metadata))));
+	const char		*mntns = NULL;
+	const char		*mntns_path;
+	int			mnt_ns_fd, fan_fd;
+	int			c, option_index, error;
+	int			support_check = 0;
+	int			ret = 0;
+
+	struct option long_options[] = {
+		[LOPT_SUPPORTED] = {"supported", no_argument, &support_check, 1 },
+		[LOPT_DEBUG]	 = {"debug", no_argument, &debug, 1 },
+		[LOPT_HELP]	 = {"help", no_argument, NULL, 0 },
+		[LOPT_MOUNTNS]	 = {"mountns", required_argument, NULL, 0 },
+		[LOPT_MAX]	 = {NULL, 0, NULL, 0 },
+	};
+
+	while ((c = getopt_long(argc, argv, "V", long_options, &option_index))
+			!= EOF) {
+		switch (c) {
+		case 0:
+			/* getopt_long already stored the plain flag options */
+			if (option_index == LOPT_MOUNTNS)
+				mntns = optarg;
+			else if (option_index == LOPT_HELP)
+				usage();
+			break;
+		case 'V':
+			fprintf(stdout, "%s %s %s\n", progname, _("version"),
+					VERSION);
+			fflush(stdout);
+			return EXIT_SUCCESS;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	/*
+	 * Try to open the mount namespace file for the current process.
+	 * fanotify requires this mount namespace file to send mount attachment
+	 * events, so this is required for correct functionality.
+	 */
+	mntns_path = mntns ? mntns : DEFAULT_MOUNTNS_FILE;
+	mnt_ns_fd = open(mntns_path, O_RDONLY);
+	if (mnt_ns_fd < 0) {
+		error = errno;
+		perror(mntns_path);
+		if (error == ENOENT && !mntns)
+			fprintf(stderr, "%s\n",
+	_("This program requires mount namespace support."));
+		ret = 1;
+		goto out;
+	}
+	if (mnt_ns_fd == DEFAULT_MOUNTNS_FD && mntns != NULL) {
+		/*
+		 * We specified a path to a mount namespace file but got fd 0,
+		 * which (for listmount and statmount) means to use the current
+		 * process' mount namespace.  That's probably not what the user
+		 * wanted.
+		 */
+		fprintf(stderr,
+	_("%s: got bad file descriptor for mount namespace\n"),
+				mntns);
+		ret = 1;
+		goto out;
+	}
+
+	fan_fd = fanotify_init(FAN_REPORT_MNT, O_RDONLY);
+	if (fan_fd < 0) {
+		/* perror() may clobber errno, so save it for the hint below. */
+		error = errno;
+		perror("fanotify_init");
+		if (error == EINVAL)
+			fprintf(stderr, "%s\n",
+	_("This program requires fanotify mount event support."));
+		ret = 1;
+		goto out;
+	}
+
+	ret = fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
+			FAN_MNT_ATTACH, mnt_ns_fd, NULL);
+	if (ret) {
+		perror("fanotify_mark");
+		goto out;
+	}
+
+	if (support_check) {
+		/*
+		 * We're being run as an ExecCondition process and we've
+		 * decided to start the main service.  There is no need to wait
+		 * for journald because the ExecStart version of ourselves will
+		 * take care of the waiting for us.
+		 */
+		return systemd_service_exit_now(0);
+	}
+
+	if (debug) {
+		printf("fanotify active\n");
+		fflush(stdout);
+	}
+
+	ret = start_existing_mounts(mnt_ns_fd);
+	if (ret)
+		goto out;
+
+	while (1) {
+		ssize_t bytes_read = read(fan_fd, buffer, sizeof(buffer));
+
+		if (bytes_read < 0) {
+			/* An interrupted read is not fatal; just retry it. */
+			if (errno == EINTR)
+				continue;
+			perror("fanotify");
+			ret = 1;
+			break;
+		}
+		handle_notifications(buffer, bytes_read, mnt_ns_fd);
+	}
+
+out:
+	return systemd_service_exit(ret);
+}
diff --git a/healer/xfs_healer_start.service.in b/healer/xfs_healer_start.service.in
new file mode 100644
index 00000000000000..6fd34eafa48c33
--- /dev/null
+++ b/healer/xfs_healer_start.service.in
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Copyright (c) 2026 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+[Unit]
+Description=Start Self Healing of XFS Metadata
+
+[Service]
+Type=exec
+Environment=SERVICE_MODE=1
+ExecCondition=@pkg_libexec_dir@/xfs_healer_start --supported
+ExecStart=@pkg_libexec_dir@/xfs_healer_start
+
+# This service starts more services, so we want it to try to restart any time
+# the program exits with a failure or crashes.
+Restart=on-failure
+
+# Create the service underneath the healer background service slice so that we
+# can control resource usage.
+Slice=system-xfs_healer.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Must run with full privileges in a shared mount namespace so that we can
+# see new mounts and tell systemd to start the per-mount healer service.
+DynamicUser=false
+ProtectSystem=false
+ProtectHome=no
+PrivateTmp=true
+PrivateDevices=true
+
+# Don't let healer complain about paths in /etc/projects that have been hidden
+# by our sandboxing. healer doesn't care about project ids anyway.
+InaccessiblePaths=-/etc/projects
+
+# No network access except to the systemd control socket
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=AF_UNIX
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default Linux personality
+LockPersonality=true
+
+# No memory pages that are both writable and executable
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and fanotify
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+SystemCallFilter=fanotify_init fanotify_mark
+
+# xfs_healer_start needs these privileges to open the rootdir and monitor
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+AmbientCapabilities=CAP_SYS_ADMIN CAP_DAC_OVERRIDE
+NoNewPrivileges=true
+
+# xfs_healer_start doesn't create files
+UMask=7777
+
+# No access to hardware /dev files except for block devices
+ProtectClock=true
+DevicePolicy=closed
+
+[Install]
+WantedBy=multi-user.target
diff --git a/include/builddefs.in b/include/builddefs.in
index 0ab2bf1702f0f0..bdba9cd9037900 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -124,6 +124,7 @@ HAVE_LISTMOUNT = @have_listmount@
HAVE_LISTMOUNT_NS_FD = @have_listmount_ns_fd@
HAVE_STATMOUNT_SUPPORTED_MASK = @have_statmount_supported_mask@
NEED_INTERNAL_STATMOUNT = @need_internal_statmount@
+HAVE_FANOTIFY_MOUNTINFO = @have_fanotify_mountinfo@
GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall
# -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl
@@ -159,6 +160,10 @@ ifeq ($(HAVE_LIBURCU_ATOMIC64),yes)
PCFLAGS += -DHAVE_LIBURCU_ATOMIC64
endif
+ifeq ($(ENABLE_HEALER)$(HAVE_SYSTEMD)$(HAVE_LISTMOUNT)$(HAVE_FANOTIFY_MOUNTINFO),yesyesyesyes)
+HAVE_HEALER_START_DEPS = yes
+endif
+
SANITIZER_CFLAGS += @addrsan_cflags@ @threadsan_cflags@ @ubsan_cflags@ @autovar_init_cflags@
SANITIZER_LDFLAGS += @addrsan_ldflags@ @threadsan_ldflags@ @ubsan_ldflags@
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index ec4a3ef444b705..9586bc01fe0f25 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -452,3 +452,27 @@ AC_DEFUN([AC_HAVE_STATMOUNT_SUPPORTED_MASK],
AC_SUBST(have_statmount_supported_mask)
AC_SUBST(need_internal_statmount)
])
+
+#
+# Check if fanotify will give us mount notifications (6.15).
+#
+AC_DEFUN([AC_HAVE_FANOTIFY_MOUNTINFO],
+ [AC_MSG_CHECKING([for fanotify mount events])
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/fanotify.h>
+ ]], [[
+ struct fanotify_event_info_mnt info;
+
+ int fan_fd = fanotify_init(FAN_REPORT_MNT, 0);
+ fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS, FAN_MNT_ATTACH,
+ -1, NULL);
+ ]])
+ ], have_fanotify_mountinfo=yes
+ AC_MSG_RESULT(yes),
+ AC_MSG_RESULT(no))
+ AC_SUBST(have_fanotify_mountinfo)
+ ])
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 15/26] xfs_healer: don't start service if kernel support unavailable
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (13 preceding siblings ...)
2026-03-19 4:42 ` [PATCH 14/26] xfs_healer: create a service to start the per-mount healer service Darrick J. Wong
@ 2026-03-19 4:42 ` Darrick J. Wong
2026-03-19 4:42 ` [PATCH 16/26] xfs_healer: use the autofsck fsproperty to select mode Darrick J. Wong
` (10 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:42 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Use ExecCondition= in the system service to check if kernel support for
the health monitor is available. If not, we don't want to run the
service, have it fail, and generate a bunch of silly log messages.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 1 +
healer/xfs_healer.c | 47 ++++++++++++++++++++++++++++++++---------
healer/xfs_healer@.service.in | 1 +
3 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index 679bdc95ae48f8..7caa6c66a59c6f 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -27,6 +27,7 @@ struct healer_ctx {
int foreground;
int want_repair;
int print_svcname;
+ int support_check;
/* fd and fs geometry for mount */
struct xfs_fd mnt;
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 1a26ffe830e5fe..8c48d2d9ee8c2d 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -191,8 +191,14 @@ healer_nproc(
return ctx->foreground ? platform_nproc() : 1;
}
+enum mon_state {	/* what main() does once setup_monitor() returns */
+ MON_START,	/* run monitor(), then exit with status 0 */
+ MON_EXIT,	/* skip monitor() and exit with status 0 */
+ MON_ERROR,	/* skip monitor() and exit with an error status */
+};
+
/* Set ourselves up to monitor the given mountpoint for health events. */
-static int
+static enum mon_state
setup_monitor(
struct healer_ctx *ctx)
{
@@ -203,7 +209,7 @@ setup_monitor(
ret = xfd_open(&ctx->mnt, ctx->mntpoint, O_RDONLY);
if (ret) {
perror(ctx->mntpoint);
- return -1;
+ return MON_ERROR;
}
ret = try_capture_fsinfo(ctx);
@@ -274,6 +280,16 @@ setup_monitor(
close(ctx->mnt.fd);
ctx->mnt.fd = -1;
+ /*
+ * At this point, we know that the kernel is capable of repairing the
+ * filesystem and telling us that it needs repairs. If the user only
+ * wanted us to check for the capability, we're done.
+ */
+ if (ctx->support_check) {
+ close(mon_fd);
+ return MON_EXIT;
+ }
+
/*
* mon_fp consumes mon_fd. We intentionally leave mon_fp attached to
* the context so that we keep the monitoring fd open until we've torn
@@ -305,7 +321,7 @@ setup_monitor(
}
ctx->queue_active = true;
- return 0;
+ return MON_START;
out_mon_fp:
if (ctx->mon_fp)
@@ -318,7 +334,7 @@ setup_monitor(
if (ctx->mnt.fd >= 0)
close(ctx->mnt.fd);
ctx->mnt.fd = -1;
- return -1;
+ return MON_ERROR;
}
/* Monitor the given mountpoint for health events. */
@@ -395,6 +411,7 @@ usage(void)
fprintf(stderr, _(" --foreground Process events as soon as possible.\n"));
fprintf(stderr, _(" --quiet Do not log health events to stdout.\n"));
fprintf(stderr, _(" --repair Always repair corrupt metadata.\n"));
+ fprintf(stderr, _(" --supported Check that health monitoring is supported.\n"));
fprintf(stderr, _(" -V Print version.\n"));
exit(EXIT_FAILURE);
@@ -407,6 +424,7 @@ enum long_opt_nr {
LOPT_HELP,
LOPT_QUIET,
LOPT_REPAIR,
+ LOPT_SUPPORTED,
LOPT_SVCNAME,
LOPT_MAX,
@@ -439,6 +457,7 @@ main(
[LOPT_HELP] = {"help", no_argument, NULL, 0 },
[LOPT_QUIET] = {"quiet", no_argument, &ctx.log, 0 },
[LOPT_REPAIR] = {"repair", no_argument, &ctx.want_repair, 1 },
+ [LOPT_SUPPORTED] = {"supported", no_argument, &ctx.support_check, 1 },
[LOPT_SVCNAME] = {"svcname", no_argument, &ctx.print_svcname, 1 },
[LOPT_MAX] = {NULL, 0, NULL, 0 },
@@ -490,14 +509,22 @@ main(
return EXIT_SUCCESS;
}
- ret = setup_monitor(&ctx);
- if (ret)
- goto out_events;
+ switch (setup_monitor(&ctx)) {
+ case MON_ERROR:
+ ret = -1;
+ break;
+ case MON_EXIT:
+ ret = 0;
+ break;
+ case MON_START:
+ ret = 0;
+ monitor(&ctx);
+ break;
+ }
- monitor(&ctx);
-
-out_events:
teardown_monitor(&ctx);
free((char *)ctx.fsname);
+ if (ctx.support_check)
+ return systemd_service_exit_now(ret);
return systemd_service_exit(ret);
}
diff --git a/healer/xfs_healer@.service.in b/healer/xfs_healer@.service.in
index 385257872b0cbb..53f89cf9c4333d 100644
--- a/healer/xfs_healer@.service.in
+++ b/healer/xfs_healer@.service.in
@@ -17,6 +17,7 @@ RequiresMountsFor=%f
[Service]
Type=exec
Environment=SERVICE_MODE=1
+ExecCondition=@pkg_libexec_dir@/xfs_healer --supported %f
ExecStart=@pkg_libexec_dir@/xfs_healer %f
SyslogIdentifier=%N
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 16/26] xfs_healer: use the autofsck fsproperty to select mode
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (14 preceding siblings ...)
2026-03-19 4:42 ` [PATCH 15/26] xfs_healer: don't start service if kernel support unavailable Darrick J. Wong
@ 2026-03-19 4:42 ` Darrick J. Wong
2026-03-19 4:43 ` [PATCH 17/26] xfs_healer: run full scrub after lost corruption events or targeted repair failure Darrick J. Wong
` (9 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:42 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Make the xfs_healer background service query the autofsck filesystem
property to figure out which operating mode it should use.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 1
libfrog/fsproperties.h | 5 ++
healer/xfs_healer.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 105 insertions(+), 3 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index 7caa6c66a59c6f..a2a46053928e33 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -28,6 +28,7 @@ struct healer_ctx {
int want_repair;
int print_svcname;
int support_check;
+ int autofsck;
/* fd and fs geometry for mount */
struct xfs_fd mnt;
diff --git a/libfrog/fsproperties.h b/libfrog/fsproperties.h
index 11d6530bc9a6d6..1cf90d058765b2 100644
--- a/libfrog/fsproperties.h
+++ b/libfrog/fsproperties.h
@@ -52,6 +52,11 @@ bool fsprop_validate(const char *name, const char *value);
#define FSPROP_AUTOFSCK_NAME "autofsck"
+/* filesystem property name for fgetxattr */
+#define VFS_FSPROP_AUTOFSCK_NAME (FSPROP_NAMESPACE \
+ FSPROP_NAME_PREFIX \
+ FSPROP_AUTOFSCK_NAME)
+
enum fsprop_autofsck {
FSPROP_AUTOFSCK_UNSET = 0, /* do not set property */
FSPROP_AUTOFSCK_NONE, /* no background scrubs */
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 8c48d2d9ee8c2d..f4bee495979324 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include <pthread.h>
#include <stdlib.h>
+#include <sys/xattr.h>
#include "platform_defs.h"
#include "libfrog/fsgeom.h"
@@ -13,6 +14,7 @@
#include "libfrog/healthevent.h"
#include "libfrog/workqueue.h"
#include "libfrog/systemd.h"
+#include "libfrog/fsproperties.h"
#include "xfs_healer.h"
/* Program name; needed for libfrog error reports. */
@@ -191,6 +193,63 @@ healer_nproc(
return ctx->foreground ? platform_nproc() : 1;
}
+enum want_repair {	/* verdicts of want_repair_from_autofsck() */
+ WR_REPAIR,	/* repair stuff */
+ WR_LOG_ONLY,	/* log events, do not repair */
+ WR_EXIT,	/* don't run at all */
+};
+
+/*
+ * Determine want_repair from the autofsck filesystem property.  Returns
+ * WR_EXIT if the healer should not run at all, WR_LOG_ONLY to log events
+ * without repairing anything, or WR_REPAIR to repair what gets reported.
+ */
+static enum want_repair
+want_repair_from_autofsck(
+	struct healer_ctx	*ctx)
+{
+	char			valuebuf[FSPROP_MAX_VALUELEN + 1] = { 0 };
+	ssize_t			len;
+
+	/*
+	 * Any OS error (including ENODATA) or string parsing error is treated
+	 * the same as an unrecognized value.  Read at most one byte less than
+	 * the buffer holds so that the value is always NUL-terminated.
+	 */
+	len = fgetxattr(ctx->mnt.fd, VFS_FSPROP_AUTOFSCK_NAME, valuebuf,
+			sizeof(valuebuf) - 1);
+	if (len < 0)
+		goto no_advice;
+
+	switch (fsprop_autofsck_read(valuebuf)) {
+	case FSPROP_AUTOFSCK_NONE:
+		/* don't run at all */
+		return WR_EXIT;
+	case FSPROP_AUTOFSCK_CHECK:
+	case FSPROP_AUTOFSCK_OPTIMIZE:
+		/* log events, do not repair */
+		return WR_LOG_ONLY;
+	case FSPROP_AUTOFSCK_REPAIR:
+		/* repair stuff */
+		return WR_REPAIR;
+	case FSPROP_AUTOFSCK_UNSET:
+	default:
+		/* no usable directive; decide from the fs features below */
+		break;
+	}
+
+no_advice:
+	/*
+	 * For an unrecognized value, log but do not fix runtime corruption if
+	 * backref metadata are enabled.  If no backref metadata are available,
+	 * the fs is too old so don't run at all.
+	 */
+	if (healer_has_rmapbt(ctx) || healer_has_parent(ctx))
+		return WR_LOG_ONLY;
+
+	return WR_EXIT;
+}
+
enum mon_state {
MON_START,
MON_EXIT,
@@ -219,14 +278,45 @@ setup_monitor(
goto out_mnt_fd;
}
- if (ctx->want_repair) {
- /* Check that the kernel supports repairs at all. */
- if (!healer_can_repair(ctx)) {
+ if (ctx->autofsck) {
+ switch (want_repair_from_autofsck(ctx)) {
+ case WR_EXIT:
+ printf("%s: %s\n", ctx->mntpoint,
+ _("Disabling daemon per autofsck directive."));
+ fflush(stdout);
+ close(ctx->mnt.fd);
+ return MON_EXIT;
+ case WR_REPAIR:
+ ctx->want_repair = 1;
+ printf("%s: %s\n", ctx->mntpoint,
+ _("Automatically repairing per autofsck directive."));
+ fflush(stdout);
+ break;
+ case WR_LOG_ONLY:
+ ctx->want_repair = 0;
+ ctx->log = 1;
+ printf("%s: %s\n", ctx->mntpoint,
+ _("Only logging errors per autofsck directive."));
+ fflush(stdout);
+ break;
+ }
+ }
+
+ /* Check that the kernel supports repairs at all. */
+ if (ctx->want_repair && !healer_can_repair(ctx)) {
+ if (!ctx->autofsck) {
fprintf(stderr, "%s: %s\n", ctx->mntpoint,
_("XFS online repair is not supported, exiting"));
goto out_mnt_fd;
}
+ printf("%s: %s\n", ctx->mntpoint,
+ _("XFS online repair is not supported, will report only"));
+ fflush(stdout);
+ ctx->want_repair = 0;
+ }
+
+ if (ctx->want_repair) {
/* Check for backref metadata that makes repair effective. */
if (!healer_has_rmapbt(ctx))
fprintf(stderr, "%s: %s\n", ctx->mntpoint,
@@ -409,6 +499,7 @@ usage(void)
fprintf(stderr, _(" --debug Enable debugging messages.\n"));
fprintf(stderr, _(" --everything Capture all events.\n"));
fprintf(stderr, _(" --foreground Process events as soon as possible.\n"));
+ fprintf(stderr, _(" --no-autofsck Do not use the \"autofsck\" fs property to decide to repair.\n"));
fprintf(stderr, _(" --quiet Do not log health events to stdout.\n"));
fprintf(stderr, _(" --repair Always repair corrupt metadata.\n"));
fprintf(stderr, _(" --supported Check that health monitoring is supported.\n"));
@@ -422,6 +513,7 @@ enum long_opt_nr {
LOPT_EVERYTHING,
LOPT_FOREGROUND,
LOPT_HELP,
+ LOPT_NO_AUTOFSCK,
LOPT_QUIET,
LOPT_REPAIR,
LOPT_SUPPORTED,
@@ -439,6 +531,7 @@ main(
.conlock = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
.log = 1,
.mnt.fd = -1,
+ .autofsck = 1,
};
int option_index;
int vflag = 0;
@@ -455,6 +548,7 @@ main(
[LOPT_EVERYTHING] = {"everything", no_argument, &ctx.everything, 1 },
[LOPT_FOREGROUND] = {"foreground", no_argument, &ctx.foreground, 1 },
[LOPT_HELP] = {"help", no_argument, NULL, 0 },
+ [LOPT_NO_AUTOFSCK] = {"no-autofsck", no_argument, &ctx.autofsck, 0 },
[LOPT_QUIET] = {"quiet", no_argument, &ctx.log, 0 },
[LOPT_REPAIR] = {"repair", no_argument, &ctx.want_repair, 1 },
[LOPT_SUPPORTED] = {"supported", no_argument, &ctx.support_check, 1 },
@@ -492,6 +586,8 @@ main(
if (optind != argc - 1)
usage();
+ if (ctx.want_repair)
+ ctx.autofsck = 0;
ctx.mntpoint = argv[optind];
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 17/26] xfs_healer: run full scrub after lost corruption events or targeted repair failure
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (15 preceding siblings ...)
2026-03-19 4:42 ` [PATCH 16/26] xfs_healer: use the autofsck fsproperty to select mode Darrick J. Wong
@ 2026-03-19 4:43 ` Darrick J. Wong
2026-03-19 4:43 ` [PATCH 18/26] xfs_healer: use getmntent to find moved filesystems Darrick J. Wong
` (8 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:43 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
If we fail to perform a spot repair of metadata or the kernel tells us
that it lost corruption events due to queue limits, initiate a full run
of the online fsck service to try to fix the error.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 3 ++
healer/Makefile | 2 +
healer/fsrepair.c | 81 +++++++++++++++++++++++++++++++++++++++++++++-----
healer/weakhandle.c | 13 ++++++++
healer/xfs_healer.c | 7 ++++
include/builddefs.in | 1 +
scrub/Makefile | 7 ++--
7 files changed, 102 insertions(+), 12 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index a2a46053928e33..e1370323bbd66a 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -72,6 +72,7 @@ void lookup_path(struct healer_ctx *ctx,
int repair_metadata(struct healer_ctx *ctx, const struct hme_prefix *pfx,
const struct xfs_health_monitor_event *hme);
bool healer_can_repair(struct healer_ctx *ctx);
+void run_full_repair(struct healer_ctx *ctx);
/* weakhandle.c */
int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname,
@@ -80,5 +81,7 @@ int weakhandle_reopen(struct weakhandle *wh, int *fd);
void weakhandle_free(struct weakhandle **whp);
int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen,
char *path, size_t pathlen);
+int weakhandle_instance_unit_name(struct weakhandle *wh, const char *template,
+ char *unitname, size_t unitnamelen);
#endif /* XFS_HEALER_XFS_HEALER_H_ */
diff --git a/healer/Makefile b/healer/Makefile
index 1eeb727682008b..b8ffce33e90d18 100644
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -19,6 +19,8 @@ xfs_healer.c
HFILES = \
xfs_healer.h
+CFLAGS+=-DXFS_SCRUB_SVCNAME=\"$(XFS_SCRUB_SVCNAME)\"
+
LLDLIBS += $(LIBHANDLE) $(LIBFROG) $(LIBURCU) $(LIBPTHREAD)
LTDEPENDENCIES += $(LIBHANDLE) $(LIBFROG)
LLDFLAGS = -static
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 4534104f8a6ac1..9f8c128e395ebc 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -9,8 +9,14 @@
#include "libfrog/fsgeom.h"
#include "libfrog/workqueue.h"
#include "libfrog/healthevent.h"
+#include "libfrog/systemd.h"
#include "xfs_healer.h"
+enum what_next {	/* result of one try_repair_*() call */
+ NEED_FULL_REPAIR,	/* a repair attempt ended in REPAIR_FAILED */
+ REPAIR_DONE,	/* no repair attempt ended in REPAIR_FAILED */
+};
+
/* Translate scrub output flags to outcome. */
static enum repair_outcome from_repair_oflags(uint32_t oflags)
{
@@ -61,7 +67,7 @@ xfs_repair_metadata(
}
/* React to a fs-domain corruption event by repairing it. */
-static void
+static enum what_next
try_repair_wholefs(
struct healer_ctx *ctx,
const struct hme_prefix *pfx,
@@ -90,11 +96,16 @@ try_repair_wholefs(
pthread_mutex_lock(&ctx->conlock);
report_health_repair(pfx, hme, f->event_mask, outcome);
pthread_mutex_unlock(&ctx->conlock);
+
+ if (outcome == REPAIR_FAILED)
+ return NEED_FULL_REPAIR;
}
+
+ return REPAIR_DONE;
}
/* React to an ag corruption event by repairing it. */
-static void
+static enum what_next
try_repair_ag(
struct healer_ctx *ctx,
const struct hme_prefix *pfx,
@@ -126,11 +137,16 @@ try_repair_ag(
pthread_mutex_lock(&ctx->conlock);
report_health_repair(pfx, hme, f->event_mask, outcome);
pthread_mutex_unlock(&ctx->conlock);
+
+ if (outcome == REPAIR_FAILED)
+ return NEED_FULL_REPAIR;
}
+
+ return REPAIR_DONE;
}
/* React to a rtgroup corruption event by repairing it. */
-static void
+static enum what_next
try_repair_rtgroup(
struct healer_ctx *ctx,
const struct hme_prefix *pfx,
@@ -157,11 +173,16 @@ try_repair_rtgroup(
pthread_mutex_lock(&ctx->conlock);
report_health_repair(pfx, hme, f->event_mask, outcome);
pthread_mutex_unlock(&ctx->conlock);
+
+ if (outcome == REPAIR_FAILED)
+ return NEED_FULL_REPAIR;
}
+
+ return REPAIR_DONE;
}
/* React to a inode-domain corruption event by repairing it. */
-static void
+static enum what_next
try_repair_inode(
struct healer_ctx *ctx,
const struct hme_prefix *orig_pfx,
@@ -204,7 +225,12 @@ try_repair_inode(
pthread_mutex_lock(&ctx->conlock);
report_health_repair(pfx, hme, f->event_mask, outcome);
pthread_mutex_unlock(&ctx->conlock);
+
+ if (outcome == REPAIR_FAILED)
+ return NEED_FULL_REPAIR;
}
+
+ return REPAIR_DONE;
}
/* Repair a metadata corruption. */
@@ -214,6 +240,7 @@ repair_metadata(
const struct hme_prefix *pfx,
const struct xfs_health_monitor_event *hme)
{
+ enum what_next what_next;
int repair_fd;
int ret;
@@ -227,19 +254,25 @@ repair_metadata(
switch (hme->domain) {
case XFS_HEALTH_MONITOR_DOMAIN_FS:
- try_repair_wholefs(ctx, pfx, repair_fd, hme);
+ what_next = try_repair_wholefs(ctx, pfx, repair_fd, hme);
break;
case XFS_HEALTH_MONITOR_DOMAIN_AG:
- try_repair_ag(ctx, pfx, repair_fd, hme);
+ what_next = try_repair_ag(ctx, pfx, repair_fd, hme);
break;
case XFS_HEALTH_MONITOR_DOMAIN_RTGROUP:
- try_repair_rtgroup(ctx, pfx, repair_fd, hme);
+ what_next = try_repair_rtgroup(ctx, pfx, repair_fd, hme);
break;
case XFS_HEALTH_MONITOR_DOMAIN_INODE:
- try_repair_inode(ctx, pfx, repair_fd, hme);
+ what_next = try_repair_inode(ctx, pfx, repair_fd, hme);
break;
+ default:
+ what_next = REPAIR_DONE;
}
+ /* Transform into a full repair if we failed to fix this item. */
+ if (what_next == NEED_FULL_REPAIR)
+ run_full_repair(ctx);
+
close(repair_fd);
return 0;
}
@@ -259,3 +292,35 @@ healer_can_repair(
ret = ioctl(ctx->mnt.fd, XFS_IOC_SCRUB_METADATA, &sm);
return ret ? false : true;
}
+
+/* Run a full repair of the filesystem using the background fsck service. */
+void
+run_full_repair(
+ struct healer_ctx *ctx)
+{
+ char unitname[PATH_MAX];
+ int ret;
+
+ ret = weakhandle_instance_unit_name(ctx->wh, XFS_SCRUB_SVCNAME,
+ unitname, PATH_MAX);
+ if (ret) {
+ fprintf(stderr, "%s: %s\n", ctx->mntpoint,
+ _("Could not determine xfs_scrub unit name."));
+ return;
+ }
+
+ /*
+ * Scrub could already be repairing something, so try to start the unit
+ * and be content if it's already running.
+ */
+ ret = systemd_manage_unit(UM_START, unitname);
+ if (ret) {
+ fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
+ _("Could not start xfs_scrub service unit"),
+ unitname);
+ return;
+ }
+
+ printf("%s: %s\n", ctx->mntpoint, _("Full repairs in progress."));
+ fflush(stdout);
+}
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 8950e0eb1e5a43..849aa2882700d4 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -13,6 +13,7 @@
#include "libfrog/workqueue.h"
#include "libfrog/getparents.h"
#include "libfrog/paths.h"
+#include "libfrog/systemd.h"
#include "xfs_healer.h"
struct weakhandle {
@@ -199,3 +200,15 @@ weakhandle_getpath_for(
close(mnt_fd);
return ret;
}
+
+/* Compute the systemd instance unit name for this mountpoint. */
+int
+weakhandle_instance_unit_name(
+ struct weakhandle *wh,
+ const char *template,
+ char *unitname,
+ size_t unitnamelen)
+{
+ return systemd_path_instance_unit_name(template, wh->mntpoint,
+ unitname, unitnamelen);
+}
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index f4bee495979324..09b88c754a550c 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -138,6 +138,13 @@ handle_event(
pthread_mutex_unlock(&ctx->conlock);
}
+ /*
+ * If we didn't ask for all the metadata reports (including the healthy
+ * ones) and the kernel tells us it lost something, run the full scan.
+ */
+ if (hme->type == XFS_HEALTH_MONITOR_TYPE_LOST && !ctx->everything)
+ run_full_repair(ctx);
+
/* Initiate a repair if appropriate. */
if (will_repair)
repair_metadata(ctx, &pfx, hme);
diff --git a/include/builddefs.in b/include/builddefs.in
index bdba9cd9037900..3b52d1afd7031c 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -62,6 +62,7 @@ MKFS_CFG_DIR = @datadir@/@pkg_name@/mkfs
PKG_STATE_DIR = @localstatedir@/lib/@pkg_name@
XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP=$(PKG_STATE_DIR)/xfs_scrub_all_media.stamp
+XFS_SCRUB_SVCNAME=xfs_scrub@.service
CC = @cc@
BUILD_CC = @BUILD_CC@
diff --git a/scrub/Makefile b/scrub/Makefile
index ff79a265762332..aee49bfce100e2 100644
--- a/scrub/Makefile
+++ b/scrub/Makefile
@@ -8,7 +8,6 @@ include $(builddefs)
SCRUB_PREREQS=$(HAVE_GETFSMAP)
-scrub_svcname=xfs_scrub@.service
scrub_media_svcname=xfs_scrub_media@.service
ifeq ($(SCRUB_PREREQS),yes)
@@ -21,7 +20,7 @@ XFS_SCRUB_SERVICE_ARGS = -b -o autofsck
ifeq ($(HAVE_SYSTEMD),yes)
INSTALL_SCRUB += install-systemd
SYSTEMD_SERVICES=\
- $(scrub_svcname) \
+ $(XFS_SCRUB_SVCNAME) \
xfs_scrub_fail@.service \
$(scrub_media_svcname) \
xfs_scrub_media_fail@.service \
@@ -123,7 +122,7 @@ xfs_scrub_all.timer: xfs_scrub_all.timer.in $(builddefs)
$(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/gettext.py
@echo " [SED] $@"
$(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
- -e "s|@scrub_svcname@|$(scrub_svcname)|g" \
+ -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \
-e "s|@scrub_media_svcname@|$(scrub_media_svcname)|g" \
-e "s|@pkg_version@|$(PKG_VERSION)|g" \
-e "s|@stampfile@|$(XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_STAMP)|g" \
@@ -137,7 +136,7 @@ $(XFS_SCRUB_ALL_PROG): $(XFS_SCRUB_ALL_PROG).in $(builddefs) $(TOPDIR)/libfrog/g
xfs_scrub_fail: xfs_scrub_fail.in $(builddefs)
@echo " [SED] $@"
$(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
- -e "s|@scrub_svcname@|$(scrub_svcname)|g" \
+ -e "s|@scrub_svcname@|$(XFS_SCRUB_SVCNAME)|g" \
-e "s|@pkg_version@|$(PKG_VERSION)|g" < $< > $@
$(Q)chmod a+x $@
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 18/26] xfs_healer: use getmntent to find moved filesystems
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (16 preceding siblings ...)
2026-03-19 4:43 ` [PATCH 17/26] xfs_healer: run full scrub after lost corruption events or targeted repair failure Darrick J. Wong
@ 2026-03-19 4:43 ` Darrick J. Wong
2026-03-19 4:43 ` [PATCH 19/26] xfs_healer: use statmount to find moved filesystems even faster Darrick J. Wong
` (7 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:43 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
It's possible that a mounted filesystem can move mountpoints between the
time of the initial mount (at which point xfs_healer starts) and when
it actually wants to start a repair. When this happens,
weakhandle::mountpoint becomes obsolete and opening it will either fail
with ENOENT or the handle revalidation will return ESTALE.
However, we do still have a means to find the mounted filesystem -- the
fsname parameter (aka the path to the data device at mount time). This
is recorded in /proc/mounts, which means that we can iterate getmntent to
see if we can find the mount elsewhere.
As documented a few patches ago, this would be easier if we had
revocable fds that didn't pin mounts, but that's a very huge ask.
This getmntent code enables xfs_healer to find a filesystem that has
been bind mounted in a new place and the original mountpoint detached:
# mount /dev/sda /mnt
# xfs_healer /mnt &
# mount /mnt /opt --bind
# umount /mnt
The key here is that each bind mount gets a separate struct mount
object.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/weakhandle.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 51 insertions(+), 4 deletions(-)
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 849aa2882700d4..5df5207514e38e 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -65,10 +65,14 @@ weakhandle_alloc(
return -1;
}
-/* Reopen a file handle obtained via weak reference. */
-int
-weakhandle_reopen(
+/*
+ * Reopen a file handle obtained via weak reference, using the given path to a
+ * mount point.
+ */
+static int
+weakhandle_reopen_from(
struct weakhandle *wh,
+ const char *path,
int *fd)
{
void *hanp;
@@ -78,7 +82,7 @@ weakhandle_reopen(
*fd = -1;
- mnt_fd = open(wh->mntpoint, O_RDONLY);
+ mnt_fd = open(path, O_RDONLY);
if (mnt_fd < 0)
return -1;
@@ -102,6 +106,49 @@ weakhandle_reopen(
return -1;
}
+/* Reopen a file handle obtained via weak reference. */
+int
+weakhandle_reopen(
+ struct weakhandle *wh,
+ int *fd)
+{
+ FILE *mtab;
+ struct mntent *mnt;
+ int ret;
+
+ /* First try reopening using the original mountpoint */
+ ret = weakhandle_reopen_from(wh, wh->mntpoint, fd);
+ if (!ret)
+ return 0;
+
+ /*
+ * That didn't work, so now walk /proc/mounts to find a mount with the
+ * same fsname (aka xfs data device path) as when we started.
+ */
+ mtab = setmntent(_PATH_PROC_MOUNTS, "r");
+ if (!mtab)
+ return -1;
+
+ while ((mnt = getmntent(mtab)) != NULL) {
+ if (strcmp(mnt->mnt_type, "xfs"))
+ continue;
+ if (strcmp(mnt->mnt_fsname, wh->fsname))
+ continue;
+
+ ret = weakhandle_reopen_from(wh, mnt->mnt_dir, fd);
+ if (!ret)
+ break;
+ }
+
+ if (*fd < 0) {
+ errno = ESTALE;
+ ret = -1;
+ }
+
+ endmntent(mtab);
+ return ret;
+}
+
/* Tear down a weak handle */
void
weakhandle_free(
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 19/26] xfs_healer: use statmount to find moved filesystems even faster
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (17 preceding siblings ...)
2026-03-19 4:43 ` [PATCH 18/26] xfs_healer: use getmntent to find moved filesystems Darrick J. Wong
@ 2026-03-19 4:43 ` Darrick J. Wong
2026-03-20 7:11 ` Christoph Hellwig
2026-03-19 4:43 ` [PATCH 20/26] xfs_healer: validate that repair fds point to the monitored fs Darrick J. Wong
` (6 subsequent siblings)
25 siblings, 1 reply; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:43 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
As noted in the previous patch, it's possible that a mounted filesystem
can move mountpoints between the time of the initial mount (at which
point xfs_healer starts) and when it actually wants to start a repair.
The previous patch fixed that problem by using getmntent to walk
/proc/self/mounts to see if it finds a mount with the same "source"
name, aka data device.
However, this is really slow if there are a lot of filesystems because
we end up wading through a lot of irrelevant information. Fortunately,
statmount() can help us here because as of Linux 7.0 we can open the
passed-in path at startup, call statmount() on it to retrieve the
mnt_id, and then call it again later with that same mnt_id to find the
mountpoint. Luckily xfs_healthmon didn't get merged until 7.0 so it's
more or less guaranteed to be there if XFS_IOC_HEALTH_MONITOR succeeds.
Obviously if this doesn't work, we can fall back to the slow walk.
This statmount code enables xfs_healer to find a filesystem that has
had its mountpoint moved to a different place in the directory tree
without the use of bind mounts and without needing to walk the entire
mount list:
# mount -t tmpfs urk /mnt
# mount --make-rprivate /mnt
# mkdir -p /mnt/a /mnt/b
# mount /dev/sda /mnt/a
# mount --move /mnt/a /mnt/b
The key here is that the struct mount object is moved, and no new ones
are created. Therefore, the original mnt_id is still usable.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/xfs_healer.h | 7 +++++--
healer/weakhandle.c | 24 ++++++++++++++++++++++++
healer/xfs_healer.c | 37 +++++++++++++++++++++++++++++++++++--
3 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index e1370323bbd66a..96e146f266629a 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -39,6 +39,9 @@ struct healer_ctx {
/* Shared reference to the getmntent fsname for reconnecting */
const char *fsname;
+ /* Mount id for faster reconnecting */
+ uint64_t mnt_id;
+
/* weak file handle so we can reattach to filesystem */
struct weakhandle *wh;
@@ -75,8 +78,8 @@ bool healer_can_repair(struct healer_ctx *ctx);
void run_full_repair(struct healer_ctx *ctx);
/* weakhandle.c */
-int weakhandle_alloc(int fd, const char *mountpoint, const char *fsname,
- struct weakhandle **whp);
+int weakhandle_alloc(int fd, const char *mountpoint, uint64_t mnt_id,
+ const char *fsname, struct weakhandle **whp);
int weakhandle_reopen(struct weakhandle *wh, int *fd);
void weakhandle_free(struct weakhandle **whp);
int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen,
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 5df5207514e38e..358c553f883f3d 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -14,6 +14,7 @@
#include "libfrog/getparents.h"
#include "libfrog/paths.h"
#include "libfrog/systemd.h"
+#include "libfrog/statmount.h"
#include "xfs_healer.h"
struct weakhandle {
@@ -23,6 +24,9 @@ struct weakhandle {
/* Shared reference to the getmntent fsname for reconnecting */
const char *fsname;
+ /* Mount id for faster reconnecting */
+ uint64_t mnt_id;
+
/* handle to root dir */
void *hanp;
size_t hlen;
@@ -33,6 +37,7 @@ int
weakhandle_alloc(
int fd,
const char *mountpoint,
+ uint64_t mnt_id,
const char *fsname,
struct weakhandle **whp)
{
@@ -51,6 +56,7 @@ weakhandle_alloc(
return -1;
wh->mntpoint = mountpoint;
+ wh->mnt_id = mnt_id;
wh->fsname = fsname;
ret = fd_to_handle(fd, &wh->hanp, &wh->hlen);
@@ -112,6 +118,9 @@ weakhandle_reopen(
struct weakhandle *wh,
int *fd)
{
+ const size_t smbuf_size =
+ libfrog_statmount_sizeof(PATH_MAX);
+ struct statmount *smbuf = alloca(smbuf_size);
FILE *mtab;
struct mntent *mnt;
int ret;
@@ -121,6 +130,21 @@ weakhandle_reopen(
if (!ret)
return 0;
+ /*
+ * The original mountpoint didn't work, which means the mount might
+ * have been moved. Look up the mountpoint for the mount id that we
+ * captured earlier, which is a quick lookup if there are many mounts.
+ * Note that @ret is nonzero here.
+ */
+ ret = libfrog_statmount(wh->mnt_id, DEFAULT_MOUNTNS_FD,
+ STATMOUNT_MNT_POINT, smbuf, smbuf_size);
+ if (ret || !(smbuf->mask & STATMOUNT_MNT_POINT))
+ goto fallback;
+ ret = weakhandle_reopen_from(wh, smbuf->str + smbuf->mnt_point, fd);
+ if (!ret)
+ return 0;
+
+fallback:
/*
* That didn't work, so now walk /proc/mounts to find a mount with the
* same fsname (aka xfs data device path) as when we started.
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 09b88c754a550c..b91d7f16774b75 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -15,6 +15,7 @@
#include "libfrog/workqueue.h"
#include "libfrog/systemd.h"
#include "libfrog/fsproperties.h"
+#include "libfrog/statmount.h"
#include "xfs_healer.h"
/* Program name; needed for libfrog error reports. */
@@ -163,11 +164,43 @@ try_capture_fsinfo(
{
struct mntent *mnt;
FILE *mtp;
- char rpath[PATH_MAX], rmnt_dir[PATH_MAX];
+ const size_t smbuf_size =
+ libfrog_statmount_sizeof(PATH_MAX + 128);
+ struct statmount *smbuf = alloca(smbuf_size);
+ char *rmnt_dir = smbuf->str;
+ char rpath[PATH_MAX];
+ int ret;
if (!realpath(ctx->mntpoint, rpath))
return -1;
+ /*
+ * In Linux 7.0 we can do statmount on an open file, which means that
+ * we can capture the mnt_id, mount point, and fsname, which can help
+ * us find a mount --move'd elsewhere in the directory tree.
+ */
+ ret = libfrog_fstatmount(ctx->mnt.fd, STATMOUNT_MNT_POINT, smbuf,
+ smbuf_size);
+ if (ret || !(smbuf->mask & STATMOUNT_MNT_POINT))
+ goto fallback;
+ if (strcmp(rpath, smbuf->str + smbuf->mnt_point))
+ goto fallback;
+
+ ret = libfrog_fstatmount(ctx->mnt.fd,
+ STATMOUNT_SB_SOURCE | STATMOUNT_MNT_BASIC,
+ smbuf, smbuf_size);
+ if (ret || !(smbuf->mask & STATMOUNT_SB_SOURCE))
+ goto fallback;
+
+ ctx->fsname = strdup(smbuf->str + smbuf->sb_source);
+ ctx->mnt_id = smbuf->mnt_id;
+ return 0;
+
+fallback:
+ /*
+ * If statmount isn't available for whatever reason, fall back to
+ * walking the mount table via getmntent.
+ */
mtp = setmntent(_PATH_PROC_MOUNTS, "r");
if (mtp == NULL)
return -1;
@@ -341,7 +374,7 @@ setup_monitor(
* paths for logging.
*/
if (ctx->want_repair || healer_has_parent(ctx)) {
- ret = weakhandle_alloc(ctx->mnt.fd, ctx->mntpoint,
+ ret = weakhandle_alloc(ctx->mnt.fd, ctx->mntpoint, ctx->mnt_id,
ctx->fsname, &ctx->wh);
if (ret) {
fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 20/26] xfs_healer: validate that repair fds point to the monitored fs
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (18 preceding siblings ...)
2026-03-19 4:43 ` [PATCH 19/26] xfs_healer: use statmount to find moved filesystems even faster Darrick J. Wong
@ 2026-03-19 4:43 ` Darrick J. Wong
2026-03-19 4:44 ` [PATCH 21/26] xfs_healer: add a manual page Darrick J. Wong
` (5 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:43 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
When xfs_healer reopens a mountpoint to perform a repair, it should
validate that the opened fd points to a file on the same filesystem as
the one being monitored.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
healer/xfs_healer.h | 4 +++-
healer/fsrepair.c | 18 +++++++++++++++++-
healer/weakhandle.c | 23 +++++++++++++++++------
3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/healer/xfs_healer.h b/healer/xfs_healer.h
index 96e146f266629a..c6692375dda6bf 100644
--- a/healer/xfs_healer.h
+++ b/healer/xfs_healer.h
@@ -80,7 +80,9 @@ void run_full_repair(struct healer_ctx *ctx);
/* weakhandle.c */
int weakhandle_alloc(int fd, const char *mountpoint, uint64_t mnt_id,
const char *fsname, struct weakhandle **whp);
-int weakhandle_reopen(struct weakhandle *wh, int *fd);
+typedef bool (*weakhandle_fd_t)(int mnt_fd, void *data);
+int weakhandle_reopen(struct weakhandle *wh, int *fd,
+ weakhandle_fd_t is_acceptable, void *data);
void weakhandle_free(struct weakhandle **whp);
int weakhandle_getpath_for(struct weakhandle *wh, uint64_t ino, uint32_t gen,
char *path, size_t pathlen);
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 9f8c128e395ebc..002e5e78fcf22e 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -233,6 +233,22 @@ try_repair_inode(
return REPAIR_DONE;
}
+/* Make sure the reopened file is on the same fs as the monitor. */
+static bool
+is_same_fs(
+ int mnt_fd,
+ void *data)
+{
+ struct xfs_health_file_on_monitored_fs hms = {
+ .fd = mnt_fd,
+ };
+ FILE *mon_fp = data;
+ int ret;
+
+ ret = ioctl(fileno(mon_fp), XFS_IOC_HEALTH_FD_ON_MONITORED_FS, &hms);
+ return ret == 0;
+}
+
/* Repair a metadata corruption. */
int
repair_metadata(
@@ -244,7 +260,7 @@ repair_metadata(
int repair_fd;
int ret;
- ret = weakhandle_reopen(ctx->wh, &repair_fd);
+ ret = weakhandle_reopen(ctx->wh, &repair_fd, is_same_fs, ctx->mon_fp);
if (ret) {
fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
_("cannot open filesystem to repair"),
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 358c553f883f3d..7b8cef0a63f971 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -79,7 +79,9 @@ static int
weakhandle_reopen_from(
struct weakhandle *wh,
const char *path,
- int *fd)
+ int *fd,
+ weakhandle_fd_t is_acceptable,
+ void *data)
{
void *hanp;
size_t hlen;
@@ -101,6 +103,11 @@ weakhandle_reopen_from(
goto out_handle;
}
+ if (is_acceptable && !is_acceptable(mnt_fd, data)) {
+ errno = ESTALE;
+ goto out_handle;
+ }
+
free_handle(hanp, hlen);
*fd = mnt_fd;
return 0;
@@ -116,7 +123,9 @@ weakhandle_reopen_from(
int
weakhandle_reopen(
struct weakhandle *wh,
- int *fd)
+ int *fd,
+ weakhandle_fd_t is_acceptable,
+ void *data)
{
const size_t smbuf_size =
libfrog_statmount_sizeof(PATH_MAX);
@@ -126,7 +135,7 @@ weakhandle_reopen(
int ret;
/* First try reopening using the original mountpoint */
- ret = weakhandle_reopen_from(wh, wh->mntpoint, fd);
+ ret = weakhandle_reopen_from(wh, wh->mntpoint, fd, is_acceptable, data);
if (!ret)
return 0;
@@ -140,7 +149,8 @@ weakhandle_reopen(
STATMOUNT_MNT_POINT, smbuf, smbuf_size);
if (ret || !(smbuf->mask & STATMOUNT_MNT_POINT))
goto fallback;
- ret = weakhandle_reopen_from(wh, smbuf->str + smbuf->mnt_point, fd);
+ ret = weakhandle_reopen_from(wh, smbuf->str + smbuf->mnt_point, fd,
+ is_acceptable, data);
if (!ret)
return 0;
@@ -159,7 +169,8 @@ weakhandle_reopen(
if (strcmp(mnt->mnt_fsname, wh->fsname))
continue;
- ret = weakhandle_reopen_from(wh, mnt->mnt_dir, fd);
+ ret = weakhandle_reopen_from(wh, mnt->mnt_dir, fd,
+ is_acceptable, data);
if (!ret)
break;
}
@@ -244,7 +255,7 @@ weakhandle_getpath_for(
fakehandle.ha_fid.fid_ino = ino;
fakehandle.ha_fid.fid_gen = gen;
- ret = weakhandle_reopen(wh, &mnt_fd);
+ ret = weakhandle_reopen(wh, &mnt_fd, NULL, NULL);
if (ret)
return ret;
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 21/26] xfs_healer: add a manual page
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (19 preceding siblings ...)
2026-03-19 4:43 ` [PATCH 20/26] xfs_healer: validate that repair fds point to the monitored fs Darrick J. Wong
@ 2026-03-19 4:44 ` Darrick J. Wong
2026-03-19 4:44 ` [PATCH 22/26] xfs_scrub: print systemd service names Darrick J. Wong
` (4 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:44 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add a new section 8 manpage for this service daemon so others can read
about what this program is supposed to do.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
man/man8/Makefile | 40 +++++++++++++---
man/man8/xfs_healer.8 | 109 +++++++++++++++++++++++++++++++++++++++++++
man/man8/xfs_healer_start.8 | 37 +++++++++++++++
3 files changed, 180 insertions(+), 6 deletions(-)
create mode 100644 man/man8/xfs_healer.8
create mode 100644 man/man8/xfs_healer_start.8
diff --git a/man/man8/Makefile b/man/man8/Makefile
index 5be76ab727a1fe..05710f85ae89ad 100644
--- a/man/man8/Makefile
+++ b/man/man8/Makefile
@@ -7,13 +7,41 @@ include $(TOPDIR)/include/builddefs
MAN_SECTION = 8
-ifneq ("$(ENABLE_SCRUB)","yes")
- MAN_PAGES = $(filter-out xfs_scrub%,$(shell echo *.$(MAN_SECTION)))
-else
- MAN_PAGES = $(shell echo *.$(MAN_SECTION))
- MAN_PAGES += xfs_scrub_all.8
+MAN_PAGES = \
+ fsck.xfs.8 \
+ mkfs.xfs.8 \
+ xfs_admin.8 \
+ xfs_bmap.8 \
+ xfs_copy.8 \
+ xfs_db.8 \
+ xfs_estimate.8 \
+ xfs_freeze.8 \
+ xfs_fsr.8 \
+ xfs_growfs.8 \
+ xfs_info.8 \
+ xfs_io.8 \
+ xfs_logprint.8 \
+ xfs_mdrestore.8 \
+ xfs_metadump.8 \
+ xfs_mkfile.8 \
+ xfs_ncheck.8 \
+ xfs_property.8 \
+ xfs_protofile.8 \
+ xfs_quota.8 \
+ xfs_repair.8 \
+ xfs_rtcp.8 \
+ xfs_spaceman.8
+
+ifeq ($(ENABLE_HEALER),yes)
+ MAN_PAGES += xfs_healer.8
endif
-MAN_PAGES += mkfs.xfs.8
+ifeq ($(HAVE_HEALER_START_DEPS),yes)
+ MAN_PAGES += xfs_healer_start.8
+endif
+ifeq ($(ENABLE_SCRUB),yes)
+ MAN_PAGES += xfs_scrub.8 xfs_scrub_all.8
+endif
+
MAN_DEST = $(PKG_MAN_DIR)/man$(MAN_SECTION)
LSRCFILES = $(MAN_PAGES)
DIRT = mkfs.xfs.8 xfs_scrub_all.8
diff --git a/man/man8/xfs_healer.8 b/man/man8/xfs_healer.8
new file mode 100644
index 00000000000000..eea799f7811a4d
--- /dev/null
+++ b/man/man8/xfs_healer.8
@@ -0,0 +1,109 @@
+.TH xfs_healer 8
+.SH NAME
+xfs_healer \- automatically heal damage to XFS filesystem metadata
+.SH SYNOPSIS
+.B xfs_healer
+[
+.B OPTIONS
+]
+.I mount-point
+.br
+.B xfs_healer \-V
+.SH DESCRIPTION
+.B xfs_healer
+is a daemon that tries to automatically repair damaged XFS filesystem metadata.
+.PP
+.B WARNING!
+This program is
+.BR EXPERIMENTAL ","
+which means that its behavior and interface
+could change at any time!
+.PP
+.B xfs_healer
+asks the kernel to report all observations of corrupt metadata, media errors,
+filesystem shutdowns, and file I/O errors.
+The program can respond to runtime metadata corruption errors by initiating
+targeted repairs of the suspect metadata or a full online fsck of the
+filesystem.
+
+Normally this program runs as a systemd service.
+The service is activated via the
+.I xfs_healer_start
+service if systemd is supported.
+
+The kernel may not support repairing or optimizing the filesystem.
+If this is the case, the filesystem must be unmounted and
+.BR xfs_repair (8)
+run on the filesystem to fix the problems.
+.SH OPTIONS
+.TP
+.BI \-\-everything
+Ask the kernel to send us good metadata health events, not only events related
+to metadata corruption, media errors, shutdowns, and I/O errors.
+.TP
+.B \-\-foreground
+Start enough event handling threads to allow consumption of all online CPUs.
+If not specified, start exactly one event handling thread.
+.TP
+.B \-\-no-autofsck
+Do not use the
+.I autofsck
+filesystem property to decide whether or not to repair corrupt metadata.
+If the
+.B \-\-repair
+option is given, then all corruptions will be repaired.
+If the
+.B \-\-repair
+option is not given, then the program will never try to repair the filesystem.
+.TP
+.B \-\-quiet
+Do not print every event to standard output.
+.TP
+.B \-\-repair
+Always try to repair each piece of corrupt metadata when the kernel tells us
+about it.
+If an individual repair fails or the kernel tells us that health events were
+lost, the
+.I xfs_scrub
+service for this mount point will be launched.
+The default is not to try to repair anything.
+If this option is specified but the kernel does not support repairs, the
+program will exit.
+.TP
+.B \-\-supported
+Check if the filesystem supports sending health events.
+Exits with 0 if it does, and non-zero if not.
+.TP
+.BI \-V
+Prints the version number and exits.
+
+.SH AUTOFSCK
+By default, this program will read the
+.I autofsck
+filesystem property to decide if it should try to repair corruptions.
+If the property is set to the value
+.B repair
+then corruptions will be repaired.
+If the property is not set but the filesystem supports all back-reference
+metadata (reverse mappings and parent pointers), then corruptions will be
+repaired.
+
+See the
+.BR xfs_scrub (8)
+manual page for more details on this filesystem property.
+
+.SH CAVEATS
+.B xfs_healer
+is an immature utility!
+Do not run this program unless you have backups of your data!
+This program takes advantage of in-kernel scrubbing to verify a given
+data structure with locks held and can keep the filesystem busy for a
+long time.
+The kernel must be new enough to support the SCRUB_METADATA ioctl.
+.PP
+If errors are found and cannot be repaired, the filesystem must be
+unmounted and repaired.
+.SH SEE ALSO
+.BR xfs_repair (8)
+and
+.BR xfs_scrub (8).
diff --git a/man/man8/xfs_healer_start.8 b/man/man8/xfs_healer_start.8
new file mode 100644
index 00000000000000..9e424432a513fe
--- /dev/null
+++ b/man/man8/xfs_healer_start.8
@@ -0,0 +1,37 @@
+.TH xfs_healer_start 8
+.SH NAME
+xfs_healer_start \- starts xfs_healer instances
+.SH SYNOPSIS
+.B xfs_healer_start
+[
+.B OPTIONS
+]
+.br
+.B xfs_healer_start \-V
+.SH DESCRIPTION
+.B xfs_healer_start
+starts the xfs_healer service whenever the kernel mounts an XFS filesystem in
+the current mount namespace.
+.PP
+.B WARNING!
+This program is
+.BR EXPERIMENTAL ","
+which means that its behavior and interface
+could change at any time!
+
+Normally this program runs as a systemd service.
+
+.SH OPTIONS
+.TP
+.B \-\-supported
+Check if the kernel supports listening for mount events.
+Exits with 0 if it does, and non-zero if not.
+.TP
+.BI "\-\-mountns " path
+Monitor the given mount namespace.
+Defaults to the mount namespace associated with the process itself.
+.TP
+.BI \-V
+Prints the version number and exits.
+.SH SEE ALSO
+.BR xfs_healer (8).
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 22/26] xfs_scrub: print systemd service names
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (20 preceding siblings ...)
2026-03-19 4:44 ` [PATCH 21/26] xfs_healer: add a manual page Darrick J. Wong
@ 2026-03-19 4:44 ` Darrick J. Wong
2026-03-19 4:44 ` [PATCH 23/26] xfs_io: add listmount and statmount commands Darrick J. Wong
` (3 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:44 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add a hidden switch to xfs_scrub to emit systemd service names for XFS
services targeting filesystem paths instead of open-coding the
computation in things like fstests.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
scrub/xfs_scrub.h | 3 +++
scrub/Makefile | 7 +++++--
scrub/xfs_scrub.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 54 insertions(+), 6 deletions(-)
diff --git a/scrub/xfs_scrub.h b/scrub/xfs_scrub.h
index 6ee359f4cebd47..041c0fadaa93c0 100644
--- a/scrub/xfs_scrub.h
+++ b/scrub/xfs_scrub.h
@@ -108,6 +108,9 @@ struct scrub_ctx {
* this much space per volume.
*/
double fstrim_block_pct;
+
+ /* CLI options, must be int */
+ int print_svcname;
};
/*
diff --git a/scrub/Makefile b/scrub/Makefile
index aee49bfce100e2..4aa0a7d836c342 100644
--- a/scrub/Makefile
+++ b/scrub/Makefile
@@ -8,9 +8,12 @@ include $(builddefs)
SCRUB_PREREQS=$(HAVE_GETFSMAP)
-scrub_media_svcname=xfs_scrub_media@.service
+XFS_SCRUB_MEDIA_SVCNAME=xfs_scrub_media@.service
ifeq ($(SCRUB_PREREQS),yes)
+CFLAGS+=-DXFS_SCRUB_SVCNAME=\"$(XFS_SCRUB_SVCNAME)\"
+CFLAGS+=-DXFS_SCRUB_MEDIA_SVCNAME=\"$(XFS_SCRUB_MEDIA_SVCNAME)\"
+
LTCOMMAND = xfs_scrub
INSTALL_SCRUB = install-scrub
XFS_SCRUB_ALL_PROG = xfs_scrub_all.py
@@ -22,7 +25,7 @@ INSTALL_SCRUB += install-systemd
SYSTEMD_SERVICES=\
$(XFS_SCRUB_SVCNAME) \
xfs_scrub_fail@.service \
- $(scrub_media_svcname) \
+ $(XFS_SCRUB_MEDIA_SVCNAME) \
xfs_scrub_media_fail@.service \
xfs_scrub_all.service \
xfs_scrub_all_fail.service \
diff --git a/scrub/xfs_scrub.c b/scrub/xfs_scrub.c
index 79937aa8cce4c4..b74dc1635141aa 100644
--- a/scrub/xfs_scrub.c
+++ b/scrub/xfs_scrub.c
@@ -710,6 +710,13 @@ parse_o_opts(
}
}
+enum long_opt_nr {
+ LOPT_HELP,
+ LOPT_SVCNAME,
+
+ LOPT_MAX,
+};
+
int
main(
int argc,
@@ -717,11 +724,15 @@ main(
{
struct scrub_ctx ctx = {
.fstrim_block_pct = FSTRIM_BLOCK_PCT_DEFAULT,
+ .lock = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER,
+ .mode = SCRUB_MODE_REPAIR,
+ .error_action = ERRORS_CONTINUE,
};
struct phase_rusage all_pi;
char *mtab = NULL;
FILE *progress_fp = NULL;
struct fs_path *fsp;
+ int option_index;
int vflag = 0;
int c;
int fd;
@@ -742,11 +753,25 @@ main(
goto out_unicrash;
}
- pthread_mutex_init(&ctx.lock, NULL);
- ctx.mode = SCRUB_MODE_REPAIR;
- ctx.error_action = ERRORS_CONTINUE;
- while ((c = getopt(argc, argv, "a:bC:de:kM:m:no:pTvxV")) != EOF) {
+ struct option long_options[] = {
+ [LOPT_HELP] = {"help", no_argument, NULL, 0 },
+ [LOPT_SVCNAME] = {"svcname", no_argument, &ctx.print_svcname, 1 },
+
+ [LOPT_MAX] = {NULL, 0, NULL, 0 },
+ };
+
+ while ((c = getopt_long(argc, argv, "a:bC:de:kM:m:no:pTvxV",
+ long_options, &option_index)) != EOF) {
switch (c) {
+ case 0:
+ switch (option_index) {
+ case LOPT_HELP:
+ usage();
+ break;
+ default:
+ break;
+ }
+ break;
case 'a':
ctx.max_errors = cvt_u64(optarg, 10);
if (errno) {
@@ -860,6 +885,23 @@ main(
if (!ctx.actual_mntpoint)
ctx.actual_mntpoint = ctx.mntpoint;
+ if (ctx.print_svcname) {
+ char unitname[PATH_MAX];
+ const char *template =
+ scrub_data ? XFS_SCRUB_MEDIA_SVCNAME :
+ XFS_SCRUB_SVCNAME;
+
+ ret = systemd_path_instance_unit_name(template,
+ ctx.mntpoint, unitname, sizeof(unitname));
+ if (ret) {
+ perror(ctx.mntpoint);
+ return EXIT_FAILURE;
+ }
+
+ printf("%s\n", unitname);
+ return EXIT_SUCCESS;
+ }
+
stdout_isatty = isatty(STDOUT_FILENO);
stderr_isatty = isatty(STDERR_FILENO);
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 23/26] xfs_io: add listmount and statmount commands
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (21 preceding siblings ...)
2026-03-19 4:44 ` [PATCH 22/26] xfs_scrub: print systemd service names Darrick J. Wong
@ 2026-03-19 4:44 ` Darrick J. Wong
2026-03-19 4:45 ` [PATCH 24/26] mkfs: enable online repair if all backrefs are enabled Darrick J. Wong
` (2 subsequent siblings)
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:44 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Add two new commands: one to list all mounts via statmount, now that we
use this in xfs_healer_start, and another to statmount each open file.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
io/io.h | 6 +
io/Makefile | 5 +
io/init.c | 1
io/listmount.c | 361 +++++++++++++++++++++++++++++++++++++++++++++++++++++
man/man8/xfs_io.8 | 66 ++++++++++
5 files changed, 439 insertions(+)
create mode 100644 io/listmount.c
diff --git a/io/io.h b/io/io.h
index 0f12b3cfed5e76..5f1f278d14a033 100644
--- a/io/io.h
+++ b/io/io.h
@@ -164,3 +164,9 @@ void fsprops_init(void);
void aginfo_init(void);
void healthmon_init(void);
void verifymedia_init(void);
+
+#ifdef HAVE_LISTMOUNT
+void listmount_init(void);
+#else
+# define listmount_init() do { } while (0)
+#endif
diff --git a/io/Makefile b/io/Makefile
index 79d5e172b8f31f..e25742b635396e 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -90,6 +90,11 @@ ifeq ($(HAVE_GETFSMAP),yes)
CFILES += fsmap.c
endif
+ifeq ($(HAVE_LISTMOUNT),yes)
+CFILES += listmount.c
+LCFLAGS += -DHAVE_LISTMOUNT
+endif
+
default: depend $(LTCOMMAND)
include $(BUILDRULES)
diff --git a/io/init.c b/io/init.c
index f2a551ef559200..ba60cb2199639b 100644
--- a/io/init.c
+++ b/io/init.c
@@ -94,6 +94,7 @@ init_commands(void)
fsprops_init();
healthmon_init();
verifymedia_init();
+ listmount_init();
}
/*
diff --git a/io/listmount.c b/io/listmount.c
new file mode 100644
index 00000000000000..af4ebaf7861250
--- /dev/null
+++ b/io/listmount.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+
+#include "libfrog/flagmap.h"
+#include "libfrog/statmount.h"
+#include "command.h"
+#include "input.h"
+#include "init.h"
+#include "io.h"
+
+/*
+ * Printable names for the STATMOUNT_* mask bits.  dump_mountinfo() uses this
+ * table to decode both the mask and supported_mask fields.
+ */
+static const struct flag_map statmount_funcs[] = {
+ { STATMOUNT_SB_BASIC, N_("sb_basic") },
+ { STATMOUNT_MNT_BASIC, N_("mnt_basic") },
+ { STATMOUNT_PROPAGATE_FROM, N_("propagate_from") },
+ { STATMOUNT_MNT_ROOT, N_("mnt_root") },
+ { STATMOUNT_MNT_POINT, N_("mnt_point") },
+ { STATMOUNT_FS_TYPE, N_("fs_type") },
+ { STATMOUNT_MNT_NS_ID, N_("mnt_ns_id") },
+ { STATMOUNT_MNT_OPTS, N_("mnt_opts") },
+ { STATMOUNT_FS_SUBTYPE, N_("fs_subtype") },
+ { STATMOUNT_SB_SOURCE, N_("sb_source") },
+ { STATMOUNT_OPT_ARRAY, N_("opt_array") },
+ { STATMOUNT_OPT_SEC_ARRAY, N_("opt_sec_array") },
+ { STATMOUNT_SUPPORTED_MASK, N_("supported_mask") },
+ {0, NULL},
+};
+
+/*
+ * Printable names for the MOUNT_ATTR_* bits; dump_mountinfo() uses this
+ * table to decode the mnt_attr field.
+ */
+static const struct flag_map mount_attrs[] = {
+ { MOUNT_ATTR_RDONLY, N_("rdonly") },
+ { MOUNT_ATTR_NOSUID, N_("nosuid") },
+ { MOUNT_ATTR_NODEV, N_("nodev") },
+ { MOUNT_ATTR_NOEXEC, N_("noexec") },
+ { MOUNT_ATTR__ATIME, N_("atime") },
+ { MOUNT_ATTR_RELATIME, N_("relatime") },
+ { MOUNT_ATTR_NOATIME, N_("noatime") },
+ { MOUNT_ATTR_STRICTATIME, N_("strictatime") },
+ { MOUNT_ATTR_NODIRATIME, N_("nodiratime") },
+ { MOUNT_ATTR_IDMAP, N_("idmap") },
+ { MOUNT_ATTR_NOSYMFOLLOW, N_("nosymfollow") },
+ {0, NULL},
+};
+
+/*
+ * Printable names for the mount propagation bits; dump_mountinfo() uses this
+ * table to decode the mnt_propagation field.
+ */
+static const struct flag_map mount_prop_flags[] = {
+ { MS_SHARED, N_("shared") },
+ { MS_SLAVE, N_("nopeer") },
+ { MS_PRIVATE, N_("private") },
+ { MS_UNBINDABLE, N_("unbindable") },
+ {0, NULL},
+};
+
+/*
+ * Print the contents of a statmount result to stdout, one tab-indented
+ * "name: value" line per field.
+ *
+ * The mask line is always printed.  Every other group of fields is printed
+ * only if the corresponding STATMOUNT_* bit is set in smbuf->mask.  String
+ * fields are stored as offsets into smbuf->str.
+ *
+ * If rawflag is set, the bitmask fields (mask, mnt_attr, mnt_propagation and
+ * supported_mask) are printed as hex numbers; otherwise they are decoded
+ * into comma-separated names with mask_to_string().
+ *
+ * Nothing is printed for STATMOUNT_OPT_ARRAY or STATMOUNT_OPT_SEC_ARRAY.
+ */
+static void
+dump_mountinfo(
+ const struct statmount *smbuf,
+ bool rawflag)
+{
+ char buf[4096];
+
+ if (rawflag) {
+ printf("\tmask: 0x%llx\n", (unsigned long long)smbuf->mask);
+ } else {
+ mask_to_string(statmount_funcs, smbuf->mask, ",", buf,
+ sizeof(buf));
+ printf("\tmask: {%s}\n", buf);
+ }
+
+ if (smbuf->mask & STATMOUNT_SB_BASIC) {
+ printf("\tsb_dev_major: %u\n", smbuf->sb_dev_major);
+ printf("\tsb_dev_minor: %u\n", smbuf->sb_dev_minor);
+ printf("\tsb_magic: 0x%llx\n",
+ (unsigned long long)smbuf->sb_magic);
+ printf("\tsb_flags: 0x%x\n", smbuf->sb_flags);
+ }
+
+ if (smbuf->mask & STATMOUNT_MNT_BASIC) {
+ printf("\tmnt_id: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_id);
+ printf("\tmnt_parent_id: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_parent_id);
+ printf("\tmnt_id_old: %u\n", smbuf->mnt_id_old);
+ printf("\tmnt_parent_id_old: %u\n", smbuf->mnt_parent_id_old);
+ if (rawflag) {
+ printf("\tmnt_attr: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_attr);
+ printf("\tmnt_propagation: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_propagation);
+ } else {
+ mask_to_string(mount_attrs, smbuf->mnt_attr, ",", buf,
+ sizeof(buf));
+ printf("\tmnt_attr: {%s}\n", buf);
+ mask_to_string(mount_prop_flags, smbuf->mnt_propagation,
+ ",", buf, sizeof(buf));
+ printf("\tmnt_propagation: {%s}\n", buf);
+ }
+ printf("\tmnt_peer_group: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_peer_group);
+ printf("\tmnt_master: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_master);
+ }
+
+ if (smbuf->mask & STATMOUNT_PROPAGATE_FROM)
+ printf("\tpropagate_from: 0x%llx\n",
+ (unsigned long long)smbuf->propagate_from);
+
+ if (smbuf->mask & STATMOUNT_MNT_ROOT)
+ printf("\tmnt_root: %s\n", smbuf->str + smbuf->mnt_root);
+ if (smbuf->mask & STATMOUNT_MNT_POINT)
+ printf("\tmnt_point: %s\n", smbuf->str + smbuf->mnt_point);
+ if (smbuf->mask & STATMOUNT_FS_TYPE)
+ printf("\tfs_type: %s\n", smbuf->str + smbuf->fs_type);
+ if (smbuf->mask & STATMOUNT_FS_SUBTYPE)
+ printf("\tfs_subtype: %s\n", smbuf->str + smbuf->fs_subtype);
+
+ if (smbuf->mask & STATMOUNT_MNT_NS_ID)
+ printf("\tmnt_ns_id: 0x%llx\n",
+ (unsigned long long)smbuf->mnt_ns_id);
+
+ if (smbuf->mask & STATMOUNT_MNT_OPTS)
+ printf("\tmnt_opts: %s\n", smbuf->str + smbuf->mnt_opts);
+ if (smbuf->mask & STATMOUNT_SB_SOURCE)
+ printf("\tsb_source: %s\n", smbuf->str + smbuf->sb_source);
+
+ if (smbuf->mask & STATMOUNT_SUPPORTED_MASK) {
+ if (rawflag) {
+ printf("\tsupported_mask: 0x%llx\n",
+ (unsigned long long)smbuf->supported_mask);
+ } else {
+ mask_to_string(statmount_funcs, smbuf->supported_mask,
+ ",", buf, sizeof(buf));
+ printf("\tsupported_mask: {%s}\n", buf);
+ }
+ }
+}
+
+/*
+ * Decide whether a mount passes the user's filesystem type filter.
+ *
+ * Returns true if no filter was given (fstype is NULL).  Otherwise the
+ * mount's type is formatted as "type", or "type.subtype" if statmount
+ * reported a subtype, and must compare equal to fstype.  A mount for which
+ * statmount did not report STATMOUNT_FS_TYPE never matches a filter.
+ */
+static inline bool
+match_mount(
+	const struct statmount	*smbuf,
+	const char		*fstype)
+{
+	char			real_fstype[256];
+
+	if (!fstype)
+		return true;
+
+	if (!(smbuf->mask & STATMOUNT_FS_TYPE))
+		return false;
+
+	/*
+	 * Bound the output by the destination array.  sizeof(fstype) is only
+	 * the size of a pointer, which would truncate longer type names so
+	 * that they could never match the filter.
+	 */
+	if (smbuf->mask & STATMOUNT_FS_SUBTYPE)
+		snprintf(real_fstype, sizeof(real_fstype), "%s.%s",
+				smbuf->str + smbuf->fs_type,
+				smbuf->str + smbuf->fs_subtype);
+	else
+		snprintf(real_fstype, sizeof(real_fstype), "%s",
+				smbuf->str + smbuf->fs_type);
+
+	return strcmp(fstype, real_fstype) == 0;
+}
+
+/* Print the long help text for the listmount command. */
+static void
+listmount_help(void)
+{
+ printf(_(
+"\n"
+" List all mounted filesystems.\n"
+"\n"
+" -f -- statmount mask flags to set. Defaults to all possible flags.\n"
+" -i -- only list mounts below this mount id. Defaults to the rootdir.\n"
+" -n -- path to a procfs mount namespace file.\n"
+" -r -- do not decode flags fields into strings.\n"
+" -t -- only display mount info for this fs type.\n"
+));
+}
+
+#define NR_MNT_IDS 7
+
+/*
+ * listmount command: walk the mounts below a mount id, statmount each one,
+ * and print those that pass the optional fs type filter.
+ *
+ * Options:
+ *   -f mask	statmount request mask (default: all bits set)
+ *   -i id	list mounts below this mount id (default: LSMT_ROOT)
+ *   -n path	open this path and use it as the mount namespace fd
+ *   -r		print bitmasks raw instead of decoding them
+ *   -t type	only print mounts of this fs type
+ *
+ * Returns 1 if option parsing or the buffer allocation fails, and 0
+ * otherwise.  listmount/statmount failures are reported with perror() and
+ * end the walk, but still return 0.
+ */
+static int
+listmount_f(
+	int			argc,
+	char			**argv)
+{
+	uint64_t		mnt_ids[NR_MNT_IDS];
+	uint64_t		cursor = LISTMOUNT_INIT_CURSOR;
+	uint64_t		statmount_flags = -1ULL;
+	uint64_t		mnt_id = LSMT_ROOT;
+	struct statmount	*smbuf = NULL;
+	const char		*fstype = NULL;
+	unsigned long long	rows = 0;
+	const size_t		smbuf_size = libfrog_statmount_sizeof(4096);
+	int			mnt_ns_fd = DEFAULT_MOUNTNS_FD;
+	bool			close_ns_fd = false;
+	int			rawflag = 0;
+	int			retval = 1;
+	int			nr_ids;
+	int			i;
+	int			c;
+	int			ret;
+
+	while ((c = getopt(argc, argv, "f:i:n:rt:")) > 0) {
+		switch (c) {
+		case 'f':
+			errno = 0;
+			statmount_flags = strtoull(optarg, NULL, 0);
+			if (errno) {
+				perror(optarg);
+				goto out;
+			}
+			break;
+		case 'i':
+			errno = 0;
+			mnt_id = strtoull(optarg, NULL, 0);
+			if (errno) {
+				perror(optarg);
+				goto out;
+			}
+			break;
+		case 'n':
+			/* Don't leak an earlier fd if -n is repeated. */
+			if (close_ns_fd)
+				close(mnt_ns_fd);
+			close_ns_fd = false;
+
+			mnt_ns_fd = open(optarg, O_RDONLY);
+			if (mnt_ns_fd < 0) {
+				perror(optarg);
+				goto out;
+			}
+			close_ns_fd = true;
+			break;
+		case 'r':
+			rawflag++;
+			break;
+		case 't':
+			fstype = optarg;
+			break;
+		default:
+			listmount_help();
+			goto out;
+		}
+	}
+
+	smbuf = malloc(smbuf_size);
+	if (!smbuf) {
+		perror("malloc");
+		goto out;
+	}
+
+	/* From here on, failures are reported but the command returns 0. */
+	retval = 0;
+
+	/* Filtering by type needs the type strings, whatever -f asked for. */
+	if (fstype)
+		statmount_flags |= STATMOUNT_FS_TYPE | STATMOUNT_FS_SUBTYPE;
+
+	/*
+	 * The batch size and the per-mount statmount result are kept in
+	 * separate variables.  Reusing one variable for both zeroes the loop
+	 * bound after the first successful statmount, so only the first
+	 * mount of each batch would be printed.
+	 */
+	while ((nr_ids = libfrog_listmount(mnt_id, mnt_ns_fd, &cursor,
+					mnt_ids, NR_MNT_IDS)) > 0) {
+		for (i = 0; i < nr_ids; i++) {
+			ret = libfrog_statmount(mnt_ids[i], mnt_ns_fd,
+					statmount_flags, smbuf, smbuf_size);
+			if (ret) {
+				perror("statmount");
+				goto out;
+			}
+
+			if (!match_mount(smbuf, fstype))
+				continue;
+
+			printf("mnt_id[%llu]: 0x%llx\n",
+					(unsigned long long)rows++,
+					(unsigned long long)mnt_ids[i]);
+
+			dump_mountinfo(smbuf, rawflag);
+		}
+	}
+
+	if (nr_ids < 0)
+		perror("listmount");
+
+out:
+	free(smbuf);
+	/* Only close the namespace fd if -n made us open it. */
+	if (close_ns_fd)
+		close(mnt_ns_fd);
+	return retval;
+}
+
+/*
+ * Command table entry for "listmount"; registered by listmount_init().
+ * NOTE(review): CMD_NOFILE_OK presumably allows running with no open file,
+ * which fits listmount_f() never touching the current file -- confirm
+ * against command.h.
+ */
+static const struct cmdinfo listmount_cmd = {
+ .name = "listmount",
+ .cfunc = listmount_f,
+ .argmin = -1,
+ .argmax = -1,
+ .flags = CMD_NOFILE_OK | CMD_FOREIGN_OK | CMD_NOMAP_OK,
+ .oneline = N_("list mounted filesystems"),
+ .help = listmount_help,
+};
+
+/* Print the long help text for the statmount command. */
+static void
+statmount_help(void)
+{
+ printf(_(
+"\n"
+" Print statmount information for the open file.\n"
+"\n"
+" -f -- statmount mask flags to set. Defaults to all possible flags.\n"
+" -r -- do not decode flags fields into strings.\n"
+));
+}
+
+/*
+ * statmount command: query the mount that the current open file lives on
+ * and print the result.
+ *
+ * Options:
+ *   -f mask	statmount request mask (default: all bits set)
+ *   -r		print bitmasks raw instead of decoding them
+ *
+ * Returns 1 if option parsing or the buffer allocation fails, and 0
+ * otherwise.  A failed statmount call is reported with perror() but still
+ * returns 0.
+ */
+static int
+statmount_f(
+	int			argc,
+	char			**argv)
+{
+	uint64_t		statmount_flags = -1ULL;
+	struct statmount	*smbuf;
+	const size_t		smbuf_size = libfrog_statmount_sizeof(4096);
+	int			rawflag = 0;
+	int			c;
+	int			ret;
+
+	while ((c = getopt(argc, argv, "f:r")) > 0) {
+		switch (c) {
+		case 'f':
+			errno = 0;
+			statmount_flags = strtoull(optarg, NULL, 0);
+			if (errno) {
+				perror(optarg);
+				return 1;
+			}
+			break;
+		case 'r':
+			rawflag++;
+			break;
+		default:
+			/* Show this command's help, not listmount's. */
+			statmount_help();
+			return 1;
+		}
+	}
+
+	smbuf = malloc(smbuf_size);
+	if (!smbuf) {
+		perror("malloc");
+		return 1;
+	}
+
+	ret = libfrog_fstatmount(file->fd, statmount_flags, smbuf, smbuf_size);
+	if (ret) {
+		perror("statmount");
+		goto out_smbuf;
+	}
+
+	printf("path: %s\n", file->name);
+
+	dump_mountinfo(smbuf, rawflag);
+
+out_smbuf:
+	free(smbuf);
+	return 0;
+}
+
+/*
+ * Command table entry for "statmount"; registered by listmount_init().
+ * Unlike listmount_cmd this does not set CMD_NOFILE_OK; statmount_f() reads
+ * file->fd and file->name.
+ */
+static const struct cmdinfo statmount_cmd = {
+ .name = "statmount",
+ .cfunc = statmount_f,
+ .argmin = -1,
+ .argmax = -1,
+ .flags = CMD_FOREIGN_OK | CMD_NOMAP_OK,
+ .oneline = N_("statmount the open file"),
+ .help = statmount_help,
+};
+
+/* Register the listmount and statmount commands. */
+void
+listmount_init(void)
+{
+ add_command(&listmount_cmd);
+ add_command(&statmount_cmd);
+}
diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8
index 2090cd4c0b2641..61defcc377163a 100644
--- a/man/man8/xfs_io.8
+++ b/man/man8/xfs_io.8
@@ -1766,6 +1766,72 @@ .SH FILESYSTEM COMMANDS
.TP
.BI "removefsprops " name " [ " names "... ]"
Remove the given filesystem properties.
+.TP
+.BI "listmount [ \-f " mask " ] [ \-i " mnt_id " ] [ \-n " path " ] [ \-r ] [ \-t " fstype " ]"
+Print information about the mounted filesystems in a particular mount
+namespace.
+The information returned by this call corresponds to the information returned
+by the
+.BR statmount (2)
+system call.
+
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.BI "\-f " mask
+Pass this numeric argument as the mask argument to
+.BR statmount (2).
+Defaults to all bits set, to retrieve all possible information.
+
+.TP
+.BI "\-i " mnt_id
+Only return information for mounts below this mount in the mount tree.
+Defaults to the root directory.
+
+.TP
+.BI "\-n " path
+Return information for the mount namespace given by this procfs path.
+For a given process, the path will most likely look like
+.BI /proc/ $pid /ns/mnt
+though any path can be provided.
+Defaults to the mount namespace of the
+.B xfs_io
+process itself.
+
+.TP
+.B \-r
+Print raw bitmasks instead of converting them to strings.
+
+.TP
+.BI "\-t " fstype
+Only return information for filesystems of this type.
+If not specified, no filtering is performed.
+.RE
+
+.TP
+.BI "statmount [ \-f " mask " ] [ \-r ]"
+Print information about the mounted filesystem for each open file.
+The information returned by this call corresponds to the information returned
+by the
+.BR statmount (2)
+system call.
+
+.RE
+.RS 1.0i
+.PD 0
+.TP
+.BI "\-f " mask
+Pass this numeric argument as the mask argument to
+.BR statmount (2).
+Defaults to all bits set, to retrieve all possible information.
+
+.TP
+.B \-r
+Print raw bitmasks instead of converting them to strings.
+
+.RE
+.PD
.SH OTHER COMMANDS
.TP
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 24/26] mkfs: enable online repair if all backrefs are enabled
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (22 preceding siblings ...)
2026-03-19 4:44 ` [PATCH 23/26] xfs_io: add listmount and statmount commands Darrick J. Wong
@ 2026-03-19 4:45 ` Darrick J. Wong
2026-03-19 4:45 ` [PATCH 25/26] debian/control: listify the build dependencies Darrick J. Wong
2026-03-19 4:45 ` [PATCH 26/26] debian: enable xfs_healer on the root filesystem by default Darrick J. Wong
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:45 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
If all backreferences are enabled in the filesystem, then enable online
repair by default if the user didn't supply any other autofsck setting.
Users might as well get full self-repair capability if they're paying
for the extra metadata.
Note that it's up to each distro to enable the systemd services
according to their own service activation policies. Debian policy is to
enable all systemd services at package installation but they don't
enable online fsck in their Kconfig so the services won't activate.
RHEL and SUSE policy requires sysadmins to enable them explicitly unless
the OS vendor also ships a systemd preset file enabling the services.
Distros without systemd won't get any of the systemd services,
obviously.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
mkfs/xfs_mkfs.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 527a662f3ac858..f859626afdda36 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -6296,6 +6296,15 @@ main(
if (mp->m_sb.sb_agcount > 1)
rewrite_secondary_superblocks(mp);
+ /*
+ * If the filesystem has full backreferences and the user didn't
+ * express an autofsck preference, enable online repair because they
+ * might as well get some useful functionality from the extra metadata.
+ */
+ if (cli.autofsck == FSPROP_AUTOFSCK_UNSET &&
+ cli.sb_feat.rmapbt && cli.sb_feat.parent_pointers)
+ cli.autofsck = FSPROP_AUTOFSCK_REPAIR;
+
if (cli.autofsck != FSPROP_AUTOFSCK_UNSET)
set_autofsck(mp, &cli);
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 25/26] debian/control: listify the build dependencies
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (23 preceding siblings ...)
2026-03-19 4:45 ` [PATCH 24/26] mkfs: enable online repair if all backrefs are enabled Darrick J. Wong
@ 2026-03-19 4:45 ` Darrick J. Wong
2026-03-19 4:45 ` [PATCH 26/26] debian: enable xfs_healer on the root filesystem by default Darrick J. Wong
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:45 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
This will make it less gross to add more build deps later.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
debian/control | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/debian/control b/debian/control
index 6473c10be7f7d6..d50960fba205bb 100644
--- a/debian/control
+++ b/debian/control
@@ -3,7 +3,19 @@ Section: admin
Priority: optional
Maintainer: XFS Development Team <linux-xfs@vger.kernel.org>
Uploaders: Nathan Scott <nathans@debian.org>, Anibal Monsalve Salazar <anibal@debian.org>
-Build-Depends: libinih-dev (>= 53), uuid-dev, debhelper (>= 12), gettext, libtool, libedit-dev, libblkid-dev (>= 2.17), linux-libc-dev, libdevmapper-dev, libicu-dev, pkg-config, liburcu-dev, systemd-dev | systemd (<< 253-2~)
+Build-Depends: debhelper (>= 12),
+ gettext,
+ libblkid-dev (>= 2.17),
+ libdevmapper-dev,
+ libedit-dev,
+ libicu-dev,
+ libinih-dev (>= 53),
+ libtool,
+ liburcu-dev,
+ linux-libc-dev,
+ pkg-config,
+ systemd-dev | systemd (<< 253-2~),
+ uuid-dev
Standards-Version: 4.0.0
Homepage: https://xfs.wiki.kernel.org/
^ permalink raw reply related [flat|nested] 71+ messages in thread* [PATCH 26/26] debian: enable xfs_healer on the root filesystem by default
2026-03-19 4:38 ` [PATCHSET v10 1/2] " Darrick J. Wong
` (24 preceding siblings ...)
2026-03-19 4:45 ` [PATCH 25/26] debian/control: listify the build dependencies Darrick J. Wong
@ 2026-03-19 4:45 ` Darrick J. Wong
25 siblings, 0 replies; 71+ messages in thread
From: Darrick J. Wong @ 2026-03-19 4:45 UTC (permalink / raw)
To: aalbersh, djwong; +Cc: hch, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Now that we're finished building autonomous repair, enable the healer
service on the root filesystem by default. The root filesystem is
mounted by the initrd prior to starting systemd, which is why the
xfs_healer_start service cannot autostart the service for the root
filesystem.
dh_installsystemd won't activate a template service (aka one with an
at-sign in the name) even if it provides a DefaultInstance directive to
make that possible. Hence we enable this explicitly via the postinst
script.
Note that Debian enables services by default upon package installation,
so this is consistent with their policies. Their kernel doesn't enable
online fsck, so healer won't do much more than monitor for corruptions
and log them.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
debian/postinst | 8 ++++++++
debian/prerm | 13 +++++++++++++
debian/rules | 3 ++-
3 files changed, 23 insertions(+), 1 deletion(-)
create mode 100644 debian/prerm
diff --git a/debian/postinst b/debian/postinst
index d11c8d94a3cbe4..966dbb7626cab3 100644
--- a/debian/postinst
+++ b/debian/postinst
@@ -21,5 +21,13 @@ case "${1}" in
esac
#DEBHELPER#
+#
+# dh_installsystemd doesn't handle template services even if we supply a
+# default instance, so we'll install it here.
+if [ -z "${DPKG_ROOT:-}" ] && [ -d /run/systemd/system ] ; then
+ if [ "$1" = "configure" ] || [ "$1" = "abort-upgrade" ] || [ "$1" = "abort-deconfigure" ] || [ "$1" = "abort-remove" ] ; then
+ /bin/systemctl enable xfs_healer@.service || true
+ fi
+fi
exit 0
diff --git a/debian/prerm b/debian/prerm
new file mode 100644
index 00000000000000..c526dcdd1d7103
--- /dev/null
+++ b/debian/prerm
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -e
+
+# dh_installsystemd doesn't handle template services even if we supply a
+# default instance, so we'll disable it here.
+if [ -z "${DPKG_ROOT:-}" ] && [ "$1" = remove ] && [ -d /run/systemd/system ] ; then
+ /bin/systemctl disable xfs_healer@.service || true
+fi
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/rules b/debian/rules
index 7c9f90e6c483ff..aaf99a95ce3df5 100755
--- a/debian/rules
+++ b/debian/rules
@@ -97,4 +97,5 @@ override_dh_installdocs:
dh_installdocs -XCHANGES
override_dh_installsystemd:
- dh_installsystemd -p xfsprogs --no-restart-after-upgrade --no-stop-on-upgrade system-xfs_scrub.slice xfs_scrub_all.timer
+ dh_installsystemd -p xfsprogs --no-restart-after-upgrade --no-stop-on-upgrade system-xfs_scrub.slice xfs_scrub_all.timer system-xfs_healer.slice
+ dh_installsystemd -p xfsprogs --restart-after-upgrade xfs_healer_start.service
^ permalink raw reply related [flat|nested] 71+ messages in thread