[PATCH 02/11] xfs: start creating infrastructure for health monitoring

Linux filesystem development
 help / color / mirror / Atom feed

* [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-06  7:10 [PATCHSET V4] xfs: autonomous self healing of filesystems Darrick J. Wong
@ 2026-01-06  7:11 ` Darrick J. Wong
  2026-01-07  9:17   ` Christoph Hellwig
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-06  7:11 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs, hch, linux-fsdevel

From: Darrick J. Wong <djwong@kernel.org>

Start creating helper functions and infrastructure to pass filesystem
health events to a health monitoring file.  Since this is an
administrative interface, we only support a single health monitor
process per filesystem, so we don't need to use anything fancy such as
notifier chains (== tons of indirect calls).

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |    8 +
 fs/xfs/xfs_healthmon.h |   36 ++++++
 fs/xfs/xfs_mount.h     |    4 +
 fs/xfs/Makefile        |    1 
 fs/xfs/xfs_health.c    |    1 
 fs/xfs/xfs_healthmon.c |  294 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_mount.c     |    2 
 8 files changed, 350 insertions(+)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_healthmon.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 12463ba766da05..dba7896f716092 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,6 +1003,13 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+struct xfs_health_monitor {
+	__u64	flags;		/* flags */
+	__u8	format;		/* output format */
+	__u8	pad1[7];	/* zeroes */
+	__u64	pad2[2];	/* zeroes */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1042,6 +1049,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
+#define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
new file mode 100644
index 00000000000000..218d5aac87b012
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_HEALTHMON_H__
+#define __XFS_HEALTHMON_H__
+
+struct xfs_healthmon {
+	/*
+	 * Weak reference to the xfs filesystem that is being monitored.  It
+	 * will be set to zero when the filesystem detaches from the monitor.
+	 * Do not dereference this pointer.
+	 */
+	uintptr_t			mount_cookie;
+
+	/*
+	 * Device number of the filesystem being monitored.  This is for
+	 * consistent tracing even after unmount.
+	 */
+	dev_t				dev;
+
+	/*
+	 * Reference count of this structure.  The open healthmon fd holds one
+	 * ref, the xfs_mount holds another ref if it points to this object,
+	 * and running event handlers hold their own refs.
+	 */
+	refcount_t			ref;
+};
+
+void xfs_healthmon_unmount(struct xfs_mount *mp);
+
+long xfs_ioc_health_monitor(struct file *file,
+		struct xfs_health_monitor __user *arg);
+
+#endif /* __XFS_HEALTHMON_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b871dfde372b52..61c71128d171cb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_healthmon;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
@@ -342,6 +343,9 @@ typedef struct xfs_mount {
 
 	/* Hook to feed dirent updates to an active online repair. */
 	struct xfs_hooks	m_dir_update_hooks;
+
+	/* Private data referring to a health monitor object. */
+	struct xfs_healthmon	*m_healthmon;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bf501cf827172..1b7385e23b3463 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_handle.o \
 				   xfs_health.o \
+				   xfs_healthmon.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index fbb8886c72fe5e..3d50397f8f7c00 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
new file mode 100644
index 00000000000000..3fdac72b478f3f
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/eventpoll.h>
+#include <linux/poll.h>
+
+/*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace.  To
+ * accomplish this, we establish a thread_with_file kthread object to handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace.  Then we hook various parts of the filesystem
+ * to supply those internal events to the kthread.  Userspace reads events
+ * from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem.  The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
+
+/* sign of a detached health monitor */
+#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
+
+/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+/* Grab a reference to the healthmon object for a given mount, if any. */
+static struct xfs_healthmon *
+xfs_healthmon_get(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm;
+
+	rcu_read_lock();
+	hm = mp->m_healthmon;
+	if (hm && !refcount_inc_not_zero(&hm->ref))
+		hm = NULL;
+	rcu_read_unlock();
+
+	return hm;
+}
+
+/*
+ * Free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+static inline void
+xfs_healthmon_free(
+	struct xfs_healthmon		*hm)
+{
+	kfree_rcu_mightsleep(hm);
+}
+
+/*
+ * Release the reference to a healthmon object and free it if there are no
+ * more holders.
+ */
+static void
+xfs_healthmon_put(
+	struct xfs_healthmon		*hm)
+{
+	if (refcount_dec_and_test(&hm->ref))
+		xfs_healthmon_free(hm);
+}
+
+/* Is this health monitor active? */
+static inline bool
+xfs_healthmon_activated(
+	struct xfs_healthmon	*hm)
+{
+	return hm->mount_cookie != DETACHED_MOUNT_COOKIE;
+}
+
+/* Is this health monitor watching the given filesystem? */
+static inline bool
+xfs_healthmon_covers_fs(
+	struct xfs_healthmon	*hm,
+	struct super_block	*sb)
+{
+	return hm->mount_cookie == (uintptr_t)sb;
+}
+
+/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
+STATIC int
+xfs_healthmon_attach(
+	struct xfs_mount	*mp,
+	struct xfs_healthmon	*hm)
+{
+	int			ret = 0;
+
+	spin_lock(&xfs_healthmon_lock);
+	if (mp->m_healthmon == NULL) {
+		mp->m_healthmon = hm;
+		hm->mount_cookie = (uintptr_t)mp->m_super;
+		refcount_inc(&hm->ref);
+	} else {
+		ret = -EEXIST;
+	}
+	spin_unlock(&xfs_healthmon_lock);
+
+	return ret;
+}
+
+/* Detach a xfs mount from a specific healthmon instance. */
+STATIC void
+xfs_healthmon_detach(
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (xfs_healthmon_activated(hm)) {
+		struct xfs_mount	*mp =
+			XFS_M((struct super_block *)hm->mount_cookie);
+
+		mp->m_healthmon = NULL;
+		hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+	} else {
+		hm = NULL;
+	}
+	spin_unlock(&xfs_healthmon_lock);
+
+	if (hm)
+		xfs_healthmon_put(hm);
+}
+
+/* Detach the xfs mount from this healthmon instance. */
+void
+xfs_healthmon_unmount(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	xfs_healthmon_detach(hm);
+	xfs_healthmon_put(hm);
+}
+
+STATIC ssize_t
+xfs_healthmon_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	return -EIO;
+}
+
+/* Free the health monitoring information. */
+STATIC int
+xfs_healthmon_release(
+	struct inode		*inode,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	/*
+	 * We might be closing the healthmon file before the filesystem
+	 * unmounts, because userspace processes can terminate at any time and
+	 * for any reason.  Null out xfs_mount::m_healthmon so that another
+	 * process can create another health monitor file.
+	 */
+	xfs_healthmon_detach(hm);
+
+	xfs_healthmon_put(hm);
+	return 0;
+}
+
+/* Validate ioctl parameters. */
+static inline bool
+xfs_healthmon_validate(
+	const struct xfs_health_monitor	*hmo)
+{
+	if (hmo->flags)
+		return false;
+	if (hmo->format)
+		return false;
+	if (memchr_inv(&hmo->pad1, 0, sizeof(hmo->pad1)))
+		return false;
+	if (memchr_inv(&hmo->pad2, 0, sizeof(hmo->pad2)))
+		return false;
+	return true;
+}
+
+/* Emit some data about the health monitoring fd. */
+static void
+xfs_healthmon_show_fdinfo(
+	struct seq_file		*m,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+			xfs_healthmon_activated(hm) ? "alive" : "dead",
+			MAJOR(hm->dev), MINOR(hm->dev));
+}
+
+static const struct file_operations xfs_healthmon_fops = {
+	.owner		= THIS_MODULE,
+	.show_fdinfo	= xfs_healthmon_show_fdinfo,
+	.read_iter	= xfs_healthmon_read_iter,
+	.release	= xfs_healthmon_release,
+};
+
+/*
+ * Create a health monitoring file.  Returns an index to the fd table or a
+ * negative errno.
+ */
+long
+xfs_ioc_health_monitor(
+	struct file			*file,
+	struct xfs_health_monitor __user *arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				ret;
+
+	/*
+	 * The only intended user of the health monitoring system should be the
+	 * xfs_healer daemon running on behalf of the whole filesystem in the
+	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
+	 * (they can use fsnotify) nor do we allow containers.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (ip->i_ino != mp->m_sb.sb_rootino)
+		return -EPERM;
+	if (current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+	if (!hm)
+		return -ENOMEM;
+	hm->dev = mp->m_super->s_dev;
+	refcount_set(&hm->ref, 1);
+
+	/*
+	 * Try to attach this health monitor to the xfs_mount.  The monitor is
+	 * considered live and will receive events if this succeeds.
+	 */
+	ret = xfs_healthmon_attach(mp, hm);
+	if (ret)
+		goto out_hm;
+
+	/*
+	 * Create the anonymous file and install a fd for it.  If it succeeds,
+	 * the file owns hm and can go away at any time, so we must not access
+	 * it again.  This must go last because we can't undo a fd table
+	 * installation.
+	 */
+	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+			O_CLOEXEC | O_RDONLY);
+	if (ret < 0)
+		goto out_mp;
+
+	return ret;
+
+out_mp:
+	xfs_healthmon_detach(hm);
+out_hm:
+	ASSERT(refcount_read(&hm->ref) == 1);
+	xfs_healthmon_put(hm);
+	return ret;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad77437181..c04c41ca924e37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1419,6 +1420,9 @@ xfs_file_ioctl(
 	case XFS_IOC_COMMIT_RANGE:
 		return xfs_ioc_commit_range(filp, arg);
 
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_ioc_health_monitor(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0953f6ae94abc8..ab67c91915384c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -625,6 +626,7 @@ xfs_unmount_flush_inodes(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
+	xfs_healthmon_unmount(mp);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-06  7:11 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
@ 2026-01-07  9:17   ` Christoph Hellwig
  2026-01-07 18:50     ` Darrick J. Wong
  0 siblings, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-07  9:17 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, linux-xfs, hch, linux-fsdevel

On Mon, Jan 05, 2026 at 11:11:08PM -0800, Darrick J. Wong wrote:
> +struct xfs_health_monitor {
> +	__u64	flags;		/* flags */
> +	__u8	format;		/* output format */
> +	__u8	pad1[7];	/* zeroes */
> +	__u64	pad2[2];	/* zeroes */
> +};

Why not use a single __u8-based padding field?

> +struct xfs_healthmon {
> +	/*
> +	 * Weak reference to the xfs filesystem that is being monitored.  It
> +	 * will be set to zero when the filesystem detaches from the monitor.
> +	 * Do not dereference this pointer.
> +	 */
> +	uintptr_t			mount_cookie;
> +
> +	/*
> +	 * Device number of the filesystem being monitored.  This is for
> +	 * consistent tracing even after unmount.
> +	 */
> +	dev_t				dev;

It isn't really used for tracking, but just in a single print, right?

> + * be parsed easily by userspace.  Then we hook various parts of the filesystem

Is the hooking terminology still right?

> + * The healthmon abstraction has a weak reference to the host filesystem mount
> + * so that the queueing and processing of the events do not pin the mount and
> + * cannot slow down the main filesystem.  The healthmon object can exist past
> + * the end of the filesystem mount.
> + */
> +
> +/* sign of a detached health monitor */
> +#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)

This almost looks like a not performance optimized version of hazard
pointers.  Not that we care much about performance here.

> +/*
> + * Free the health monitor after an RCU grace period to eliminate possibility
> + * of races with xfs_healthmon_get.
> + */
> +static inline void
> +xfs_healthmon_free(
> +	struct xfs_healthmon		*hm)
> +{
> +	kfree_rcu_mightsleep(hm);
> +}

Is there much of a point in this wrapper vs just open coding the call to
kfree_rcu_mightsleep in the only caller?

> +/* Is this health monitor active? */
> +static inline bool
> +xfs_healthmon_activated(
> +	struct xfs_healthmon	*hm)
> +{
> +	return hm->mount_cookie != DETACHED_MOUNT_COOKIE;
> +}
> +
> +/* Is this health monitor watching the given filesystem? */
> +static inline bool
> +xfs_healthmon_covers_fs(
> +	struct xfs_healthmon	*hm,
> +	struct super_block	*sb)
> +{
> +	return hm->mount_cookie == (uintptr_t)sb;
> +}

Is there much of a point in these helpers vs open coding them in the callers?
(no caller yet in this patch of the second one anyway).  Especially as we
need to hold a lock for them to be safe.

> +
> +/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
> +STATIC int
> +xfs_healthmon_attach(
> +	struct xfs_mount	*mp,
> +	struct xfs_healthmon	*hm)
> +{
> +	int			ret = 0;
> +
> +	spin_lock(&xfs_healthmon_lock);
> +	if (mp->m_healthmon == NULL) {
> +		mp->m_healthmon = hm;
> +		hm->mount_cookie = (uintptr_t)mp->m_super;
> +		refcount_inc(&hm->ref);
> +	} else {
> +		ret = -EEXIST;
> +	}
> +	spin_unlock(&xfs_healthmon_lock);
> +
> +	return ret;

Maybe just me, but I'd do away with the ret variable and just handle the
EEXIST case directly:

	spin_lock(&xfs_healthmon_lock);
	if (mp->m_healthmon) {
		spin_unlock(&xfs_healthmon_lock);
		return -EEXIST;
	}
	refcount_inc(&hm->ref);
	mp->m_healthmon = hm;
	hm->mount_cookie = (uintptr_t)mp->m_super;
	spin_unlock(&xfs_healthmon_lock);
	return 0;

> +/* Detach a xfs mount from a specific healthmon instance. */
> +STATIC void
> +xfs_healthmon_detach(
> +	struct xfs_healthmon	*hm)
> +{
> +	spin_lock(&xfs_healthmon_lock);
> +	if (xfs_healthmon_activated(hm)) {
> +		struct xfs_mount	*mp =
> +			XFS_M((struct super_block *)hm->mount_cookie);
> +
> +		mp->m_healthmon = NULL;
> +		hm->mount_cookie = DETACHED_MOUNT_COOKIE;
> +	} else {
> +		hm = NULL;
> +	}
> +	spin_unlock(&xfs_healthmon_lock);
> +
> +	if (hm)
> +		xfs_healthmon_put(hm);
> +}

Kinda similar here:

	struct xfs_mount	*mp;

	spin_lock(&xfs_healthmon_lock);
	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
		spin_unlock(&xfs_healthmon_lock);
		return;
	}

	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
	spin_unlock(&xfs_healthmon_lock);

	xfs_healthmon_put(hm);


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-07  9:17   ` Christoph Hellwig
@ 2026-01-07 18:50     ` Darrick J. Wong
  2026-01-08 10:21       ` Christoph Hellwig
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-07 18:50 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: cem, linux-xfs, linux-fsdevel

On Wed, Jan 07, 2026 at 10:17:13AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 05, 2026 at 11:11:08PM -0800, Darrick J. Wong wrote:
> > +struct xfs_health_monitor {
> > +	__u64	flags;		/* flags */
> > +	__u8	format;		/* output format */
> > +	__u8	pad1[7];	/* zeroes */
> > +	__u64	pad2[2];	/* zeroes */
> > +};
> 
> Why not use a single __u8-based padding field?

Ok.

> > +struct xfs_healthmon {
> > +	/*
> > +	 * Weak reference to the xfs filesystem that is being monitored.  It
> > +	 * will be set to zero when the filesystem detaches from the monitor.
> > +	 * Do not dereference this pointer.
> > +	 */
> > +	uintptr_t			mount_cookie;
> > +
> > +	/*
> > +	 * Device number of the filesystem being monitored.  This is for
> > +	 * consistent tracing even after unmount.
> > +	 */
> > +	dev_t				dev;
> 
> It isn't really used for tracking, but just in a single print, right?

It's used for tracepoints and fdinfo.

> > + * be parsed easily by userspace.  Then we hook various parts of the filesystem
> 
> Is the hooking terminology still right?

I still think of the entry points as hooks, but I'll reword it to avoid
confusion with the actual xfs_hooks:

"When those internal events occur, the filesystem will call this health
monitor to convey them to userspace."

> > + * The healthmon abstraction has a weak reference to the host filesystem mount
> > + * so that the queueing and processing of the events do not pin the mount and
> > + * cannot slow down the main filesystem.  The healthmon object can exist past
> > + * the end of the filesystem mount.
> > + */
> > +
> > +/* sign of a detached health monitor */
> > +#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
> 
> This almost looks like a not performance optimized version of hazard
> pointers.  Not that we care much about performance here.

Yep.  AFAIK the kernel doesn't have an actual hazard pointer
implementation that we could latch onto, right?

> > +/*
> > + * Free the health monitor after an RCU grace period to eliminate possibility
> > + * of races with xfs_healthmon_get.
> > + */
> > +static inline void
> > +xfs_healthmon_free(
> > +	struct xfs_healthmon		*hm)
> > +{
> > +	kfree_rcu_mightsleep(hm);
> > +}
> 
> Is there much of a point in this wrapper vs just open coding the call to
> kfree_rcu_mightsleep in the only caller?

No, and indeed this could be compressed into _healthmon_put:

	if (refcount_dec_and_test(&hm->ref)) {
		while (hm->first_event) {
			/* free hm->first_event */
		}
		kfree(hm->buffer);
		mutex_destroy(&hm->lock);
		kfree_rcu_mightsleep(hm);
	}

which would be much easier to think about.

> > +/* Is this health monitor active? */
> > +static inline bool
> > +xfs_healthmon_activated(
> > +	struct xfs_healthmon	*hm)
> > +{
> > +	return hm->mount_cookie != DETACHED_MOUNT_COOKIE;
> > +}
> > +
> > +/* Is this health monitor watching the given filesystem? */
> > +static inline bool
> > +xfs_healthmon_covers_fs(
> > +	struct xfs_healthmon	*hm,
> > +	struct super_block	*sb)
> > +{
> > +	return hm->mount_cookie == (uintptr_t)sb;
> > +}
> 
> Is there much of a point in these helpers vs open coding them in the callers?
> (no caller yet in this patch of the second one anyway).  Especially as we
> need to hold a lock for them to be safe.

The only one really worth keeping is _healthmon_activated because it
gets called from various places.  And even then, it now only has three
callsites so maybe it'll just go away.

> > +
> > +/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
> > +STATIC int
> > +xfs_healthmon_attach(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_healthmon	*hm)
> > +{
> > +	int			ret = 0;
> > +
> > +	spin_lock(&xfs_healthmon_lock);
> > +	if (mp->m_healthmon == NULL) {
> > +		mp->m_healthmon = hm;
> > +		hm->mount_cookie = (uintptr_t)mp->m_super;
> > +		refcount_inc(&hm->ref);
> > +	} else {
> > +		ret = -EEXIST;
> > +	}
> > +	spin_unlock(&xfs_healthmon_lock);
> > +
> > +	return ret;
> 
> Maybe just me, but I'd do away with the ret variable and just handle the
> EEXIST case directly:
> 
> 	spin_lock(&xfs_healthmon_lock);
> 	if (mp->m_healthmon) {
> 		spin_unlock(&xfs_healthmon_lock);
> 		return -EEXIST;
> 	}
> 	refcount_inc(&hm->ref);
> 	mp->m_healthmon = hm;
> 	hm->mount_cookie = (uintptr_t)mp->m_super;
> 	spin_unlock(&xfs_healthmon_lock);
> 	return 0;
> 
> > +/* Detach a xfs mount from a specific healthmon instance. */
> > +STATIC void
> > +xfs_healthmon_detach(
> > +	struct xfs_healthmon	*hm)
> > +{
> > +	spin_lock(&xfs_healthmon_lock);
> > +	if (xfs_healthmon_activated(hm)) {
> > +		struct xfs_mount	*mp =
> > +			XFS_M((struct super_block *)hm->mount_cookie);
> > +
> > +		mp->m_healthmon = NULL;
> > +		hm->mount_cookie = DETACHED_MOUNT_COOKIE;
> > +	} else {
> > +		hm = NULL;
> > +	}
> > +	spin_unlock(&xfs_healthmon_lock);
> > +
> > +	if (hm)
> > +		xfs_healthmon_put(hm);
> > +}
> 
> Kinda similar here:
> 
> 	struct xfs_mount	*mp;
> 
> 	spin_lock(&xfs_healthmon_lock);
> 	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
> 		spin_unlock(&xfs_healthmon_lock);
> 		return;
> 	}
> 
> 	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
> 	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
> 	spin_unlock(&xfs_healthmon_lock);
> 
> 	xfs_healthmon_put(hm);

Will change.  And get rid of some of the mount cookie helpers.

Thanks for reading!

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-07 18:50     ` Darrick J. Wong
@ 2026-01-08 10:21       ` Christoph Hellwig
  0 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-08 10:21 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, cem, linux-xfs, linux-fsdevel

On Wed, Jan 07, 2026 at 10:50:55AM -0800, Darrick J. Wong wrote:
> > This almost looks like a not performance optimized version of hazard
> > pointers.  Not that we care much about performance here.
> 
> Yep.  AFAIK the kernel doesn't have an actual hazard pointer
> implementation that we could latch onto, right?

It's in the works.  And probably complete overkill here; your
code just reminded me of a presentation Paul gave on it.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-13  0:32 [PATCHSET v5] xfs: autonomous self healing of filesystems Darrick J. Wong
@ 2026-01-13  0:33 ` Darrick J. Wong
  2026-01-13 16:03   ` Christoph Hellwig
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-13  0:33 UTC (permalink / raw)
  To: djwong, cem; +Cc: hch, linux-fsdevel, linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Start creating helper functions and infrastructure to pass filesystem
health events to a health monitoring file.  Since this is an
administrative interface, we only support a single health monitor
process per filesystem, so we don't need to use anything fancy such as
notifier chains (== tons of indirect calls).

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |    7 +
 fs/xfs/xfs_healthmon.h |   36 +++++++
 fs/xfs/xfs_mount.h     |    4 +
 fs/xfs/Makefile        |    1 
 fs/xfs/xfs_health.c    |    1 
 fs/xfs/xfs_healthmon.c |  262 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_mount.c     |    2 
 8 files changed, 317 insertions(+)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_healthmon.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 12463ba766da05..c58e55b3df4099 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,6 +1003,12 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+struct xfs_health_monitor {
+	__u64	flags;		/* flags */
+	__u8	format;		/* output format */
+	__u8	pad[23];	/* zeroes */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1042,6 +1048,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
+#define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
new file mode 100644
index 00000000000000..218d5aac87b012
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_HEALTHMON_H__
+#define __XFS_HEALTHMON_H__
+
+struct xfs_healthmon {
+	/*
+	 * Weak reference to the xfs filesystem that is being monitored.  It
+	 * will be set to zero when the filesystem detaches from the monitor.
+	 * Do not dereference this pointer.
+	 */
+	uintptr_t			mount_cookie;
+
+	/*
+	 * Device number of the filesystem being monitored.  This is for
+	 * consistent tracing and fdinfo output even after unmount.
+	 */
+	dev_t				dev;
+
+	/*
+	 * Reference count of this structure.  The open healthmon fd holds one
+	 * ref, the xfs_mount holds another ref if it points to this object,
+	 * and running event handlers hold their own refs.
+	 */
+	refcount_t			ref;
+};
+
+void xfs_healthmon_unmount(struct xfs_mount *mp);
+
+long xfs_ioc_health_monitor(struct file *file,
+		struct xfs_health_monitor __user *arg);
+
+#endif /* __XFS_HEALTHMON_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b871dfde372b52..61c71128d171cb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_healthmon;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
@@ -342,6 +343,9 @@ typedef struct xfs_mount {
 
 	/* Hook to feed dirent updates to an active online repair. */
 	struct xfs_hooks	m_dir_update_hooks;
+
+	/* Private data referring to a health monitor object. */
+	struct xfs_healthmon	*m_healthmon;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bf501cf827172..1b7385e23b3463 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_handle.o \
 				   xfs_health.o \
+				   xfs_healthmon.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index fbb8886c72fe5e..3d50397f8f7c00 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
new file mode 100644
index 00000000000000..b7095ea55897c5
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/eventpoll.h>
+#include <linux/poll.h>
+
+/*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace.  To
+ * accomplish this, we establish an anonymous inode file object to handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace.  When those internal events occur, the core
+ * filesystem code calls this health monitor to convey the events to userspace.
+ * Userspace reads events from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem.  The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
+
+/* sign of a detached health monitor */
+#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
+
+/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+/* Grab a reference to the healthmon object for a given mount, if any. */
+static struct xfs_healthmon *
+xfs_healthmon_get(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm;
+
+	rcu_read_lock();
+	hm = mp->m_healthmon;
+	if (hm && !refcount_inc_not_zero(&hm->ref))
+		hm = NULL;
+	rcu_read_unlock();
+
+	return hm;
+}
+
+/*
+ * Release the reference to a healthmon object.  If there are no more holders,
+ * free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+static void
+xfs_healthmon_put(
+	struct xfs_healthmon		*hm)
+{
+	if (refcount_dec_and_test(&hm->ref))
+		kfree_rcu_mightsleep(hm);
+}
+
+/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
+STATIC int
+xfs_healthmon_attach(
+	struct xfs_mount	*mp,
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (mp->m_healthmon != NULL) {
+		spin_unlock(&xfs_healthmon_lock);
+		return -EEXIST;
+	}
+
+	refcount_inc(&hm->ref);
+	mp->m_healthmon = hm;
+	hm->mount_cookie = (uintptr_t)mp->m_super;
+	spin_unlock(&xfs_healthmon_lock);
+
+	return 0;
+}
+
+/* Detach a xfs mount from a specific healthmon instance. */
+STATIC void
+xfs_healthmon_detach(
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
+		spin_unlock(&xfs_healthmon_lock);
+		return;
+	}
+
+	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
+	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+	spin_unlock(&xfs_healthmon_lock);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Detach the xfs mount from this healthmon instance. */
+void
+xfs_healthmon_unmount(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	xfs_healthmon_detach(hm);
+	xfs_healthmon_put(hm);
+}
+
+STATIC ssize_t
+xfs_healthmon_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	return -EIO;
+}
+
+/* Free the health monitoring information. */
+STATIC int
+xfs_healthmon_release(
+	struct inode		*inode,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	/*
+	 * We might be closing the healthmon file before the filesystem
+	 * unmounts, because userspace processes can terminate at any time and
+	 * for any reason.  Null out xfs_mount::m_healthmon so that another
+	 * process can create another health monitor file.
+	 */
+	xfs_healthmon_detach(hm);
+
+	xfs_healthmon_put(hm);
+	return 0;
+}
+
+/* Validate ioctl parameters. */
+static inline bool
+xfs_healthmon_validate(
+	const struct xfs_health_monitor	*hmo)
+{
+	if (hmo->flags)
+		return false;
+	if (hmo->format)
+		return false;
+	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
+		return false;
+	return true;
+}
+
+/* Emit some data about the health monitoring fd. */
+static void
+xfs_healthmon_show_fdinfo(
+	struct seq_file		*m,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
+				"dead" : "alive",
+			MAJOR(hm->dev), MINOR(hm->dev));
+}
+
+static const struct file_operations xfs_healthmon_fops = {
+	.owner		= THIS_MODULE,
+	.show_fdinfo	= xfs_healthmon_show_fdinfo,
+	.read_iter	= xfs_healthmon_read_iter,
+	.release	= xfs_healthmon_release,
+};
+
+/*
+ * Create a health monitoring file.  Returns an index to the fd table or a
+ * negative errno.
+ */
+long
+xfs_ioc_health_monitor(
+	struct file			*file,
+	struct xfs_health_monitor __user *arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				ret;
+
+	/*
+	 * The only intended user of the health monitoring system should be the
+	 * xfs_healer daemon running on behalf of the whole filesystem in the
+	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
+	 * (they can use fsnotify) nor do we allow containers.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (ip->i_ino != mp->m_sb.sb_rootino)
+		return -EPERM;
+	if (current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+	if (!hm)
+		return -ENOMEM;
+	hm->dev = mp->m_super->s_dev;
+	refcount_set(&hm->ref, 1);
+
+	/*
+	 * Try to attach this health monitor to the xfs_mount.  The monitor is
+	 * considered live and will receive events if this succeeds.
+	 */
+	ret = xfs_healthmon_attach(mp, hm);
+	if (ret)
+		goto out_hm;
+
+	/*
+	 * Create the anonymous file and install a fd for it.  If it succeeds,
+	 * the file owns hm and can go away at any time, so we must not access
+	 * it again.  This must go last because we can't undo a fd table
+	 * installation.
+	 */
+	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+			O_CLOEXEC | O_RDONLY);
+	if (ret < 0)
+		goto out_mp;
+
+	return ret;
+
+out_mp:
+	xfs_healthmon_detach(hm);
+out_hm:
+	ASSERT(refcount_read(&hm->ref) == 1);
+	xfs_healthmon_put(hm);
+	return ret;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad77437181..c04c41ca924e37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1419,6 +1420,9 @@ xfs_file_ioctl(
 	case XFS_IOC_COMMIT_RANGE:
 		return xfs_ioc_commit_range(filp, arg);
 
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_ioc_health_monitor(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0953f6ae94abc8..ab67c91915384c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -625,6 +626,7 @@ xfs_unmount_flush_inodes(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
+	xfs_healthmon_unmount(mp);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-13  0:33 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
@ 2026-01-13 16:03   ` Christoph Hellwig
  0 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-13 16:03 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-fsdevel, linux-xfs

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCHSET v6] xfs: autonomous self healing of filesystems
@ 2026-01-16  5:42 Darrick J. Wong
  2026-01-16  5:42 ` [PATCH 01/11] docs: discuss autonomous self healing in the xfs online repair design doc Darrick J. Wong
                   ` (10 more replies)
  0 siblings, 11 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

Hi all,

This patchset builds new functionality to deliver live information about
filesystem health events to userspace.  This is done by creating an
anonymous file that can be read() for events by userspace programs.
Events are captured by hooking various parts of XFS and iomap so that
metadata health failures, file I/O errors, and major changes in
filesystem state (unmounts, shutdowns, etc.) can be observed by
programs.

When an event occurs, the hook functions queue an event object to each
event anonfd for later processing.  Programs must have CAP_SYS_ADMIN
to open the anonfd and there's a maximum event lag to prevent resource
overconsumption.  The events themselves can be read() from the anonfd
as C structs for the xfs_healer daemon.

In userspace, we create a new daemon program that will read the event
objects and initiate repairs automatically.  This daemon is managed
entirely by systemd and will not block unmounting of the filesystem
unless repairs are ongoing.  They are auto-started by a starter
service that uses fanotify.

This patchset depends on the new fserror code that Christian Brauner
has tentatively accepted for Linux 7.0:
https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs-7.0.fserror

v6: fix pi-breaking bugs, make verify failures trigger health reports
    and filter bio status flags better
v5: add verify-media ioctl, collapse small helper funcs with only
    one caller
v4: drop multiple client support so we can make direct calls into
    healthmon instead of chasing pointers and doing indirect calls
v3: drag out of rfc status

If you're going to start using this code, I strongly recommend pulling
from my git trees, which are linked below.

With a bit of luck, this should all go splendidly.
Comments and questions are, as always, welcome.

--D

Unreviewed patches in this series:
  [PATCH 04/11] xfs: convey filesystem unmount events to the health
  [PATCH 06/11] xfs: convey filesystem shutdown events to the health
  [PATCH 11/11] xfs: add media verification ioctl

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=health-monitoring

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=health-monitoring

fstests git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfstests-dev.git/log/?h=health-monitoring
---
Commits in this patchset:
 * docs: discuss autonomous self healing in the xfs online repair design doc
 * xfs: start creating infrastructure for health monitoring
 * xfs: create event queuing, formatting, and discovery infrastructure
 * xfs: convey filesystem unmount events to the health monitor
 * xfs: convey metadata health events to the health monitor
 * xfs: convey filesystem shutdown events to the health monitor
 * xfs: convey externally discovered fsdax media errors to the health monitor
 * xfs: convey file I/O errors to the health monitor
 * xfs: allow toggling verbose logging on the health monitoring file
 * xfs: check if an open file is on the health monitored fs
 * xfs: add media verification ioctl
---
 fs/xfs/libxfs/xfs_fs.h                             |  189 +++
 fs/xfs/libxfs/xfs_health.h                         |    5 
 fs/xfs/xfs_healthmon.h                             |  184 +++
 fs/xfs/xfs_mount.h                                 |    4 
 fs/xfs/xfs_trace.h                                 |  512 ++++++++
 fs/xfs/xfs_verify_media.h                          |   13 
 .../filesystems/xfs/xfs-online-fsck-design.rst     |  153 ++
 fs/xfs/Makefile                                    |    2 
 fs/xfs/xfs_fsops.c                                 |    2 
 fs/xfs/xfs_health.c                                |  124 ++
 fs/xfs/xfs_healthmon.c                             | 1255 ++++++++++++++++++++
 fs/xfs/xfs_ioctl.c                                 |    7 
 fs/xfs/xfs_mount.c                                 |    2 
 fs/xfs/xfs_notify_failure.c                        |   17 
 fs/xfs/xfs_super.c                                 |   12 
 fs/xfs/xfs_trace.c                                 |    5 
 fs/xfs/xfs_verify_media.c                          |  459 +++++++
 17 files changed, 2938 insertions(+), 7 deletions(-)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_verify_media.h
 create mode 100644 fs/xfs/xfs_healthmon.c
 create mode 100644 fs/xfs/xfs_verify_media.c


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 01/11] docs: discuss autonomous self healing in the xfs online repair design doc
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
@ 2026-01-16  5:42 ` Darrick J. Wong
  2026-01-16  5:42 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Update the XFS online repair document to describe the motivation and
design of the autonomous filesystem healing agent known as xfs_healer.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 .../filesystems/xfs/xfs-online-fsck-design.rst     |  153 ++++++++++++++++++++
 1 file changed, 151 insertions(+), 2 deletions(-)


diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst
index 3d9233f403dbb1..fd936d1b7a32a2 100644
--- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst
+++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst
@@ -166,9 +166,12 @@ The current XFS tools leave several problems unsolved:
    malicious actors **exploit quirks of Unicode** to place misleading names
    in directories.
 
+8. **Site Reliability and Support Engineers** would like to reduce the
+   frequency of incidents requiring **manual intervention**.
+
 Given this definition of the problems to be solved and the actors who would
 benefit, the proposed solution is a third fsck tool that acts on a running
-filesystem.
+filesystem, and an autonomous agent that fixes problems as they arise.
 
 This new third program has three components: an in-kernel facility to check
 metadata, an in-kernel facility to repair metadata, and a userspace driver
@@ -203,6 +206,13 @@ Even if a piece of filesystem metadata can only be regenerated by scanning the
 entire system, the scan can still be done in the background while other file
 operations continue.
 
+The autonomous self healing agent should listen for metadata health impact
+reports coming from the kernel and automatically schedule repairs for the
+damaged metadata.
+If the required repairs are larger in scope than a single metadata structure,
+``xfs_scrub`` should be invoked to perform a full analysis.
+``xfs_healer`` is the name of this program.
+
 In summary, online fsck takes advantage of resource sharding and redundant
 metadata to enable targeted checking and repair operations while the system
 is running.
@@ -850,11 +860,16 @@ variable in the following service files:
 * ``xfs_scrub_all_fail.service``
 
 The decision to enable the background scan is left to the system administrator.
-This can be done by enabling either of the following services:
+This can be done system-wide by enabling either of the following services:
 
 * ``xfs_scrub_all.timer`` on systemd systems
 * ``xfs_scrub_all.cron`` on non-systemd systems
 
+To enable online repair for specific filesystems, the ``autofsck``
+filesystem property should be set to ``repair``.
+To enable only scanning, the property should be set to ``check``.
+To disable online fsck entirely, the property should be set to ``none``.
+
 This automatic weekly scan is configured out of the box to perform an
 additional media scan of all file data once per month.
 This is less foolproof than, say, storing file data block checksums, but much
@@ -897,6 +912,36 @@ notifications and initiate a repair?
 *Answer*: These questions remain unanswered, but should be a part of the
 conversation with early adopters and potential downstream users of XFS.
 
+Autonomous Self Healing
+-----------------------
+
+The autonomous self healing agent is a background system service that starts
+when the filesystem is mounted and runs until unmount.
+When starting up, the agent opens a special pseudofile under the specific
+mount.
+When the filesystem generates new adverse health events, the events will be
+made available for reading via the special pseudofile.
+The events need not be limited to metadata concerns; they can also reflect
+events outside of the filesystem's direct control such as file I/O errors.
+
+The agent reads these events in a loop and responds to the events
+appropriately.
+For a single trouble report about metadata, the agent initiates a targeted
+repair of the specific structure.
+If that repair fails or the agent observes too many metadata trouble reports
+over a short interval, it should then initiate a full scan of the filesystem
+via the ``xfs_scrub`` service.
+
+The decision to enable the self healing agent is left to the system administrator.
+This can be done system-wide by enabling the following services:
+
+* ``xfs_healer@.service`` on systemd systems
+
+To enable autonomous healing for specific filesystems, the ``autofsck``
+filesystem property should be set to ``repair``.
+To disable self healing, the property should be set to ``check``,
+``optimize``, or ``none``.
+
 5. Kernel Algorithms and Data Structures
 ========================================
 
@@ -4780,6 +4825,70 @@ Orphaned files are adopted by the orphanage as follows:
 7. If a runtime error happens, call ``xrep_adoption_cancel`` to release all
    resources.
 
+Health Monitoring
+-----------------
+
+A self-correcting filesystem responds to observations of problems by scheduling
+repairs of the affected areas.
+The filesystem must therefore create event objects in response to stimuli
+(metadata corruption, file I/O errors, etc.) and dispatch these events to
+downstream consumers.
+
+However, the decision to translate an adverse metadata health report into a
+repair should be made by userspace, and the actual scheduling done by userspace.
+Some users (e.g. containers) would prefer to fast-fail the container and restart
+it on another node at a previous checkpoint.
+For workloads running in isolation, repairs may be preferable; either way this
+is something the system administrator knows, and not the kernel.
+A userspace agent (``xfs_healer``, described later) will collect events from the
+kernel and dispatch them appropriately.
+
+Exporting health events to userspace requires the creation of a new component,
+known as the health monitor.
+Because the monitor exposes itself to userspace to deliver information, a file
+descriptor is the natural abstraction to use here.
+The health monitor hooks all the relevant sources of metadata health events.
+Upon activation of the hook, a new event object is created and added to a queue.
+When the agent reads from the fd, event objects are pulled from the start of the
+queue and formatted into the user's buffer.
+The events are freed, and the read call returns to userspace to allow the agent
+to perform some work.
+Memory usage is constrained on a per-fd basis to prevent memory exhaustion; if
+an event must be discarded, a special "lost event" event is delivered to the
+agent.
+
+In short, health events are captured, queued, and eventually copied out to
+userspace for dispatching.
+
++----------------------------------------------------------------------+
+| **Sidebar**:                                                         |
++----------------------------------------------------------------------+
+| **Question**: Why use a pseudofile and not use existing notification |
+| methods such as fanotify?                                            |
+|                                                                      |
+| *Answer*: The pseudofile is a private filesystem interface only      |
+| available to processes with the CAP_SYS_ADMIN privilege and the      |
+| ability to open the root directory of an XFS filesystem.             |
+| Using a pseudofile gives the kernel and ``xfs_healer`` the           |
+| flexibility to expose XFS-specific filesystem details to a special   |
+| userspace daemon without cluttering up fanotify's userspace ABI.     |
+| Normal userspace programs are not expected to subscribe to the XFS   |
+| events in this manner.                                               |
+| Instead, they should subscribe to the generic events provided by     |
+| fanotify.                                                            |
+|                                                                      |
+| The pseudofile can also accept ioctls, which gives the userspace     |
+| ``xfs_healer`` program a programmatic means to validate that prior   |
+| to a repair, its reopened mountpoint is actually the same filesystem |
+| that is being monitored.                                             |
+|                                                                      |
+| Finally, on an implementation level, fsnotify provides rather little |
+| in the way of an actual event queue implementation; it's really more |
+| of an event dispatcher.                                              |
+| This means there's little advantage in terms of the quantity of new  |
+| code added since we still have to write our own queuing discipline!  |
++----------------------------------------------------------------------+
+
 6. Userspace Algorithms and Data Structures
 ===========================================
 
@@ -5071,6 +5180,46 @@ and report what has been lost.
 For media errors in blocks owned by files, parent pointers can be used to
 construct file paths from inode numbers for user-friendly reporting.
 
+Autonomous Self Healing
+-----------------------
+
+When a filesystem mounts, the Linux kernel initiates a fsnotify event
+describing the mount point and the path to the data device.
+A separate ``xfs_healer_start`` systemd service listens for these mount
+events via fanotify, and starts a mountpoint-specific ``xfs_healer``
+service instance.
+The ``xfs_healer`` service opens the mountpoint and issues the
+XFS_IOC_HEALTH_MONITOR ioctl to open a special health monitoring file.
+After that is set up, the mountpoint is closed to avoid pinning the mount.
+
+The health monitoring file hooks certain points of the filesystem so that it
+may receive events about metadata health, filesystem shutdowns, media errors,
+file I/O errors, and unmounting of the filesystem.
+Events are queued up for each health monitor file and encoded into a
+``struct xfs_health_monitor_event`` object when the agent calls ``read()`` on
+the file.
+All health events are dispatched to a background threadpool to reduce stalls
+in the main event loop.
+Events can be logged into the system log for further analysis.
+
+For metadata health events, the specific details are used to construct a call
+to the scrub ioctl.
+The filesystem mountpoint is reopened, and the kernel is called.
+If events are lost or the repairs fail, a full scan will be initiated by
+starting up an ``xfs_scrub@.service`` for the given mountpoint.
+
+A filesystem shutdown causes all future repair work to cease, and an unmount
+causes the agent to exit.
+
+**Future Work Question**: Should the healer daemon also register a dbus
+listener and publish events there?
+
+*Answer*: This is unclear -- if there's a demand for system monitoring daemons
+to consume this information and make decisions, then yes, this could be wired
+up in ``xfs_healer``.
+On the other hand, systemd is in the middle of a transition to varlink, so
+it makes more sense to wait and see what happens.
+
 7. Conclusion and Future Work
 =============================
 


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
  2026-01-16  5:42 ` [PATCH 01/11] docs: discuss autonomous self healing in the xfs online repair design doc Darrick J. Wong
@ 2026-01-16  5:42 ` Darrick J. Wong
  2026-02-06 13:07   ` Pankaj Raghav (Samsung)
  2026-01-16  5:42 ` [PATCH 03/11] xfs: create event queuing, formatting, and discovery infrastructure Darrick J. Wong
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Start creating helper functions and infrastructure to pass filesystem
health events to a health monitoring file.  Since this is an
administrative interface, we only support a single health monitor
process per filesystem, so we don't need to use anything fancy such as
notifier chains (== tons of indirect calls).

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |    7 +
 fs/xfs/xfs_healthmon.h |   36 +++++++
 fs/xfs/xfs_mount.h     |    4 +
 fs/xfs/Makefile        |    1 
 fs/xfs/xfs_health.c    |    1 
 fs/xfs/xfs_healthmon.c |  262 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_mount.c     |    2 
 8 files changed, 317 insertions(+)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_healthmon.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 12463ba766da05..c58e55b3df4099 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,6 +1003,12 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+struct xfs_health_monitor {
+	__u64	flags;		/* flags */
+	__u8	format;		/* output format */
+	__u8	pad[23];	/* zeroes */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1042,6 +1048,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
+#define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
new file mode 100644
index 00000000000000..218d5aac87b012
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_HEALTHMON_H__
+#define __XFS_HEALTHMON_H__
+
+struct xfs_healthmon {
+	/*
+	 * Weak reference to the xfs filesystem that is being monitored.  It
+	 * will be set to zero when the filesystem detaches from the monitor.
+	 * Do not dereference this pointer.
+	 */
+	uintptr_t			mount_cookie;
+
+	/*
+	 * Device number of the filesystem being monitored.  This is for
+	 * consistent tracing even after unmount.
+	 */
+	dev_t				dev;
+
+	/*
+	 * Reference count of this structure.  The open healthmon fd holds one
+	 * ref, the xfs_mount holds another ref if it points to this object,
+	 * and running event handlers hold their own refs.
+	 */
+	refcount_t			ref;
+};
+
+void xfs_healthmon_unmount(struct xfs_mount *mp);
+
+long xfs_ioc_health_monitor(struct file *file,
+		struct xfs_health_monitor __user *arg);
+
+#endif /* __XFS_HEALTHMON_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b871dfde372b52..61c71128d171cb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_healthmon;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
@@ -342,6 +343,9 @@ typedef struct xfs_mount {
 
 	/* Hook to feed dirent updates to an active online repair. */
 	struct xfs_hooks	m_dir_update_hooks;
+
+	/* Private data referring to a health monitor object. */
+	struct xfs_healthmon	*m_healthmon;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bf501cf827172..1b7385e23b3463 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_handle.o \
 				   xfs_health.o \
+				   xfs_healthmon.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index fbb8886c72fe5e..3d50397f8f7c00 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
new file mode 100644
index 00000000000000..b7095ea55897c5
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/eventpoll.h>
+#include <linux/poll.h>
+
+/*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace.  To
+ * accomplish this, we establish a thread_with_file kthread object to handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace.  When those internal events occur, the core
+ * filesystem code calls this health monitor to convey the events to userspace.
+ * Userspace reads events from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem.  The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
+
+/* sign of a detached health monitor */
+#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
+
+/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+/* Grab a reference to the healthmon object for a given mount, if any. */
+static struct xfs_healthmon *
+xfs_healthmon_get(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm;
+
+	rcu_read_lock();
+	hm = mp->m_healthmon;
+	if (hm && !refcount_inc_not_zero(&hm->ref))
+		hm = NULL;
+	rcu_read_unlock();
+
+	return hm;
+}
+
+/*
+ * Release the reference to a healthmon object.  If there are no more holders,
+ * free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+static void
+xfs_healthmon_put(
+	struct xfs_healthmon		*hm)
+{
+	if (refcount_dec_and_test(&hm->ref))
+		kfree_rcu_mightsleep(hm);
+}
+
+/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
+STATIC int
+xfs_healthmon_attach(
+	struct xfs_mount	*mp,
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (mp->m_healthmon != NULL) {
+		spin_unlock(&xfs_healthmon_lock);
+		return -EEXIST;
+	}
+
+	refcount_inc(&hm->ref);
+	mp->m_healthmon = hm;
+	hm->mount_cookie = (uintptr_t)mp->m_super;
+	spin_unlock(&xfs_healthmon_lock);
+
+	return 0;
+}
+
+/* Detach a xfs mount from a specific healthmon instance. */
+STATIC void
+xfs_healthmon_detach(
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
+		spin_unlock(&xfs_healthmon_lock);
+		return;
+	}
+
+	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
+	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+	spin_unlock(&xfs_healthmon_lock);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Detach the xfs mount from this healthmon instance. */
+void
+xfs_healthmon_unmount(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	xfs_healthmon_detach(hm);
+	xfs_healthmon_put(hm);
+}
+
+STATIC ssize_t
+xfs_healthmon_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	return -EIO;
+}
+
+/* Free the health monitoring information. */
+STATIC int
+xfs_healthmon_release(
+	struct inode		*inode,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	/*
+	 * We might be closing the healthmon file before the filesystem
+	 * unmounts, because userspace processes can terminate at any time and
+	 * for any reason.  Null out xfs_mount::m_healthmon so that another
+	 * process can create another health monitor file.
+	 */
+	xfs_healthmon_detach(hm);
+
+	xfs_healthmon_put(hm);
+	return 0;
+}
+
+/* Validate ioctl parameters. */
+static inline bool
+xfs_healthmon_validate(
+	const struct xfs_health_monitor	*hmo)
+{
+	if (hmo->flags)
+		return false;
+	if (hmo->format)
+		return false;
+	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
+		return false;
+	return true;
+}
+
+/* Emit some data about the health monitoring fd. */
+static void
+xfs_healthmon_show_fdinfo(
+	struct seq_file		*m,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
+				"dead" : "alive",
+			MAJOR(hm->dev), MINOR(hm->dev));
+}
+
+static const struct file_operations xfs_healthmon_fops = {
+	.owner		= THIS_MODULE,
+	.show_fdinfo	= xfs_healthmon_show_fdinfo,
+	.read_iter	= xfs_healthmon_read_iter,
+	.release	= xfs_healthmon_release,
+};
+
+/*
+ * Create a health monitoring file.  Returns an index to the fd table or a
+ * negative errno.
+ */
+long
+xfs_ioc_health_monitor(
+	struct file			*file,
+	struct xfs_health_monitor __user *arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				ret;
+
+	/*
+	 * The only intended user of the health monitoring system should be the
+	 * xfs_healer daemon running on behalf of the whole filesystem in the
+	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
+	 * (they can use fsnotify) nor do we allow containers.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (ip->i_ino != mp->m_sb.sb_rootino)
+		return -EPERM;
+	if (current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+	if (!hm)
+		return -ENOMEM;
+	hm->dev = mp->m_super->s_dev;
+	refcount_set(&hm->ref, 1);
+
+	/*
+	 * Try to attach this health monitor to the xfs_mount.  The monitor is
+	 * considered live and will receive events if this succeeds.
+	 */
+	ret = xfs_healthmon_attach(mp, hm);
+	if (ret)
+		goto out_hm;
+
+	/*
+	 * Create the anonymous file and install a fd for it.  If it succeeds,
+	 * the file owns hm and can go away at any time, so we must not access
+	 * it again.  This must go last because we can't undo a fd table
+	 * installation.
+	 */
+	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+			O_CLOEXEC | O_RDONLY);
+	if (ret < 0)
+		goto out_mp;
+
+	return ret;
+
+out_mp:
+	xfs_healthmon_detach(hm);
+out_hm:
+	ASSERT(refcount_read(&hm->ref) == 1);
+	xfs_healthmon_put(hm);
+	return ret;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad77437181..c04c41ca924e37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1419,6 +1420,9 @@ xfs_file_ioctl(
 	case XFS_IOC_COMMIT_RANGE:
 		return xfs_ioc_commit_range(filp, arg);
 
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_ioc_health_monitor(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0953f6ae94abc8..ab67c91915384c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -625,6 +626,7 @@ xfs_unmount_flush_inodes(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
+	xfs_healthmon_unmount(mp);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 03/11] xfs: create event queuing, formatting, and discovery infrastructure
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
  2026-01-16  5:42 ` [PATCH 01/11] docs: discuss autonomous self healing in the xfs online repair design doc Darrick J. Wong
  2026-01-16  5:42 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
@ 2026-01-16  5:42 ` Darrick J. Wong
  2026-01-16  5:43 ` [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor Darrick J. Wong
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:42 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Create the basic infrastructure that we need to report health events to
userspace.  We need a compact form for recording critical information
about an event and queueing them; a means to notice that we've lost some
events; and a means to format the events into something that userspace
can handle.  Make the kernel export C structures via read().

In a previous iteration of this new subsystem, I wanted to explore data
exchange formats that are more flexible and easier for humans to read
than C structures.  The thought being that when we want to rev (or
worse, enlarge) the event format, it ought to be trivially easy to do
that in a way that doesn't break old userspace.

I looked at formats such as protobufs and capnproto.  These look really
nice in that extending the wire format is fairly easy, you can give it a
data schema and it generates the serialization code for you, handles
endianness problems, etc.  The huge downside is that neither supports C
all that well.

That was too hard, and I didn't want to port either of those huge
sprawling libraries first to the kernel and then again to xfsprogs.
Then I thought, how about JSON?  Javascript objects are human readable,
the kernel can emit json without much fuss (it's all just strings!) and
there are plenty of interpreters for python/rust/c/etc.

There's a proposed schema format for json, which means that xfs can
publish a description of the events that kernel will emit.  Userspace
consumers (e.g. xfsprogs/xfs_healer) can embed the same schema document
and use it to validate the incoming events from the kernel, which means
it can discard events that it doesn't understand, or garbage being
emitted due to bugs.

However, json has a huge catch -- javascript is well known for its
vague definitions of what are numbers.  This makes expressing a large
number rather fraught, because the runtime is free to represent a number
in nearly any way it wants.  Stupider ones will truncate values to word
size, others will roll out doubles for uint52_t (yes, fifty-two) with
the resulting loss of precision.  Not good when you're dealing with
discrete units.

It just so happens that python's json library is smart enough to see a
sequence of digits and put them in a u64 (at least on x86_64/aarch64)
but an actual javascript interpreter (pasting into Firefox) isn't
necessarily so clever.

It turns out that none of the proposed json schemas were ever ratified
even in an open-consensus way, so json blobs are still just loosely
structured blobs.  The parsing in userspace was also noticeably slow and
memory-consumptive.

Hence only the C interface survives.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |   47 +++++
 fs/xfs/xfs_healthmon.h |   59 ++++++
 fs/xfs/xfs_trace.h     |  171 +++++++++++++++++
 fs/xfs/xfs_healthmon.c |  495 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trace.c     |    2 
 5 files changed, 768 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index c58e55b3df4099..22b86bc888de5a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,12 +1003,59 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+/* Health monitor event domains */
+
+/* affects the whole fs */
+#define XFS_HEALTH_MONITOR_DOMAIN_MOUNT		(0)
+
+/* Health monitor event types */
+
+/* status of the monitor itself */
+#define XFS_HEALTH_MONITOR_TYPE_RUNNING		(0)
+#define XFS_HEALTH_MONITOR_TYPE_LOST		(1)
+
+/* lost events */
+struct xfs_health_monitor_lost {
+	__u64	count;
+};
+
+struct xfs_health_monitor_event {
+	/* XFS_HEALTH_MONITOR_DOMAIN_* */
+	__u32	domain;
+
+	/* XFS_HEALTH_MONITOR_TYPE_* */
+	__u32	type;
+
+	/* Timestamp of the event, in nanoseconds since the Unix epoch */
+	__u64	time_ns;
+
+	/*
+	 * Details of the event.  The primary clients are written in python
+	 * and rust, so break this up because bindgen hates anonymous structs
+	 * and unions.
+	 */
+	union {
+		struct xfs_health_monitor_lost lost;
+	} e;
+
+	/* zeroes */
+	__u64	pad[2];
+};
+
 struct xfs_health_monitor {
 	__u64	flags;		/* flags */
 	__u8	format;		/* output format */
 	__u8	pad[23];	/* zeroes */
 };
 
+/* Return all health status events, not just deltas */
+#define XFS_HEALTH_MONITOR_VERBOSE	(1ULL << 0)
+
+#define XFS_HEALTH_MONITOR_ALL		(XFS_HEALTH_MONITOR_VERBOSE)
+
+/* Initial return format version */
+#define XFS_HEALTH_MONITOR_FMT_V0	(0)
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 218d5aac87b012..554ec62125449b 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -26,10 +26,69 @@ struct xfs_healthmon {
 	 * and running event handlers hold their own refs.
 	 */
 	refcount_t			ref;
+
+	/* lock for event list and event counters */
+	struct mutex			lock;
+
+	/* list of event objects */
+	struct xfs_healthmon_event	*first_event;
+	struct xfs_healthmon_event	*last_event;
+
+	/* number of events in the list */
+	unsigned int			events;
+
+	/* do we want all events? */
+	bool				verbose:1;
+
+	/* waiter so read/poll can sleep until the arrival of events */
+	struct wait_queue_head		wait;
+
+	/*
+	 * Buffer for formatting events for a read_iter call.  Events are
+	 * formatted into the buffer at bufhead, and buftail determines where
+	 * to start a copy_iter to get those events to userspace.  All buffer
+	 * fields are protected by inode_lock.
+	 */
+	char				*buffer;
+	size_t				bufsize;
+	size_t				bufhead;
+	size_t				buftail;
+
+	/* did we lose previous events? */
+	unsigned long long		lost_prev_event;
+
+	/* total counts of events observed and lost events */
+	unsigned long long		total_events;
+	unsigned long long		total_lost;
 };
 
 void xfs_healthmon_unmount(struct xfs_mount *mp);
 
+enum xfs_healthmon_type {
+	XFS_HEALTHMON_RUNNING,	/* monitor running */
+	XFS_HEALTHMON_LOST,	/* message lost */
+};
+
+enum xfs_healthmon_domain {
+	XFS_HEALTHMON_MOUNT,	/* affects the whole fs */
+};
+
+struct xfs_healthmon_event {
+	struct xfs_healthmon_event	*next;
+
+	enum xfs_healthmon_type		type;
+	enum xfs_healthmon_domain	domain;
+
+	uint64_t			time_ns;
+
+	union {
+		/* lost events */
+		struct {
+			uint64_t	lostcount;
+		};
+	};
+};
+
 long xfs_ioc_health_monitor(struct file *file,
 		struct xfs_health_monitor __user *arg);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f70afbf3cb196b..04727470b3b410 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -103,6 +103,8 @@ struct xfs_refcount_intent;
 struct xfs_metadir_update;
 struct xfs_rtgroup;
 struct xfs_open_zone;
+struct xfs_healthmon_event;
+struct xfs_healthmon;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -5906,6 +5908,175 @@ DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
 DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
 DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);
 
+TRACE_EVENT(xfs_healthmon_lost_event,
+	TP_PROTO(const struct xfs_healthmon *hm),
+	TP_ARGS(hm),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long long, lost_prev)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->lost_prev = hm->lost_prev_event;
+	),
+	TP_printk("dev %d:%d lost_prev %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lost_prev)
+);
+
+#define XFS_HEALTHMON_FLAGS_STRINGS \
+	{ XFS_HEALTH_MONITOR_VERBOSE,	"verbose" }
+#define XFS_HEALTHMON_FMT_STRINGS \
+	{ XFS_HEALTH_MONITOR_FMT_V0,	"v0" }
+
+TRACE_EVENT(xfs_healthmon_create,
+	TP_PROTO(dev_t dev, u64 flags, u8 format),
+	TP_ARGS(dev, flags, format),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, flags)
+		__field(u8, format)
+	),
+	TP_fast_assign(
+		__entry->dev = dev;
+		__entry->flags = flags;
+		__entry->format = format;
+	),
+	TP_printk("dev %d:%d flags %s format %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_flags(__entry->flags, "|", XFS_HEALTHMON_FLAGS_STRINGS),
+		  __print_symbolic(__entry->format, XFS_HEALTHMON_FMT_STRINGS))
+);
+
+TRACE_EVENT(xfs_healthmon_copybuf,
+	TP_PROTO(const struct xfs_healthmon *hm, const struct iov_iter *iov),
+	TP_ARGS(hm, iov),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(size_t, bufsize)
+		__field(size_t, inpos)
+		__field(size_t, outpos)
+		__field(size_t, to_copy)
+		__field(size_t, iter_count)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->bufsize = hm->bufsize;
+		__entry->inpos = hm->bufhead;
+		__entry->outpos = hm->buftail;
+		if (hm->bufhead > hm->buftail)
+			__entry->to_copy = hm->bufhead - hm->buftail;
+		else
+			__entry->to_copy = 0;
+		__entry->iter_count = iov_iter_count(iov);
+	),
+	TP_printk("dev %d:%d bufsize %zu in_pos %zu out_pos %zu to_copy %zu iter_count %zu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->bufsize,
+		  __entry->inpos,
+		  __entry->outpos,
+		  __entry->to_copy,
+		  __entry->iter_count)
+);
+
+DECLARE_EVENT_CLASS(xfs_healthmon_class,
+	TP_PROTO(const struct xfs_healthmon *hm),
+	TP_ARGS(hm),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, events)
+		__field(unsigned long long, lost_prev)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->events = hm->events;
+		__entry->lost_prev = hm->lost_prev_event;
+	),
+	TP_printk("dev %d:%d events %u lost_prev? %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->events,
+		  __entry->lost_prev)
+);
+#define DEFINE_HEALTHMON_EVENT(name) \
+DEFINE_EVENT(xfs_healthmon_class, name, \
+	TP_PROTO(const struct xfs_healthmon *hm), \
+	TP_ARGS(hm))
+DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_start);
+DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_finish);
+DEFINE_HEALTHMON_EVENT(xfs_healthmon_release);
+DEFINE_HEALTHMON_EVENT(xfs_healthmon_detach);
+
+#define XFS_HEALTHMON_TYPE_STRINGS \
+	{ XFS_HEALTHMON_LOST,		"lost" }
+
+#define XFS_HEALTHMON_DOMAIN_STRINGS \
+	{ XFS_HEALTHMON_MOUNT,		"mount" }
+
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT);
+
+DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
+	TP_PROTO(const struct xfs_healthmon *hm,
+		 const struct xfs_healthmon_event *event),
+	TP_ARGS(hm, event),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned int, domain)
+		__field(unsigned int, mask)
+		__field(unsigned long long, ino)
+		__field(unsigned int, gen)
+		__field(unsigned int, group)
+		__field(unsigned long long, offset)
+		__field(unsigned long long, length)
+		__field(unsigned long long, lostcount)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->type = event->type;
+		__entry->domain = event->domain;
+		__entry->mask = 0;
+		__entry->group = 0;
+		__entry->ino = 0;
+		__entry->gen = 0;
+		__entry->offset = 0;
+		__entry->length = 0;
+		__entry->lostcount = 0;
+		switch (__entry->domain) {
+		case XFS_HEALTHMON_MOUNT:
+			switch (__entry->type) {
+			case XFS_HEALTHMON_LOST:
+				__entry->lostcount = event->lostcount;
+				break;
+			}
+			break;
+		}
+	),
+	TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS),
+		  __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS),
+		  __entry->mask,
+		  __entry->ino,
+		  __entry->gen,
+		  __entry->offset,
+		  __entry->length,
+		  __entry->group,
+		  __entry->lostcount)
+);
+#define DEFINE_HEALTHMONEVENT_EVENT(name) \
+DEFINE_EVENT(xfs_healthmon_event_class, name, \
+	TP_PROTO(const struct xfs_healthmon *hm, \
+		 const struct xfs_healthmon_event *event), \
+	TP_ARGS(hm, event))
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_insert);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_push);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_pop);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format_overflow);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_drop);
+DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_merge);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index b7095ea55897c5..f1c6782f5e3915 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -45,6 +45,13 @@
 /* sign of a detached health monitor */
 #define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
 
+/* Constrain the number of event objects that can build up in memory. */
+#define XFS_HEALTHMON_MAX_EVENTS	(SZ_32K / \
+					 sizeof(struct xfs_healthmon_event))
+
+/* Constrain the size of the output buffer for read_iter. */
+#define XFS_HEALTHMON_MAX_OUTBUF	SZ_64K
+
 /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
 static DEFINE_SPINLOCK(xfs_healthmon_lock);
 
@@ -73,8 +80,20 @@ static void
 xfs_healthmon_put(
 	struct xfs_healthmon		*hm)
 {
-	if (refcount_dec_and_test(&hm->ref))
+	if (refcount_dec_and_test(&hm->ref)) {
+		struct xfs_healthmon_event	*event;
+		struct xfs_healthmon_event	*next = hm->first_event;
+
+		while ((event = next) != NULL) {
+			trace_xfs_healthmon_drop(hm, event);
+			next = event->next;
+			kfree(event);
+		}
+
+		kfree(hm->buffer);
+		mutex_destroy(&hm->lock);
 		kfree_rcu_mightsleep(hm);
+	}
 }
 
 /* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
@@ -112,9 +131,182 @@ xfs_healthmon_detach(
 	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
 	spin_unlock(&xfs_healthmon_lock);
 
+	trace_xfs_healthmon_detach(hm);
 	xfs_healthmon_put(hm);
 }
 
+static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
+{
+	hm->events++;
+	hm->total_events++;
+}
+
+static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
+{
+	hm->lost_prev_event++;
+	hm->total_lost++;
+}
+
+/*
+ * If possible, merge a new event into an existing event.  Returns whether or
+ * not it merged anything.
+ */
+static bool
+xfs_healthmon_merge_events(
+	struct xfs_healthmon_event		*existing,
+	const struct xfs_healthmon_event	*new)
+{
+	if (!existing)
+		return false;
+
+	/* type and domain must match to merge events */
+	if (existing->type != new->type ||
+	    existing->domain != new->domain)
+		return false;
+
+	switch (existing->type) {
+	case XFS_HEALTHMON_RUNNING:
+		/* should only ever be one of these events anyway */
+		return false;
+
+	case XFS_HEALTHMON_LOST:
+		existing->lostcount += new->lostcount;
+		return true;
+	}
+
+	return false;
+}
+
+/* Timestamp an event and insert it at the start of the queue. */
+static inline void
+__xfs_healthmon_insert(
+	struct xfs_healthmon		*hm,
+	struct xfs_healthmon_event	*event)
+{
+	struct timespec64		now;
+
+	ktime_get_coarse_real_ts64(&now);
+	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
+
+	/* The new event always becomes the head of the queue. */
+	event->next = hm->first_event;
+	hm->first_event = event;
+	if (!hm->last_event)
+		hm->last_event = event;
+	xfs_healthmon_bump_events(hm);
+	wake_up(&hm->wait);
+
+	trace_xfs_healthmon_insert(hm, event);
+}
+
+/* Push an event onto the end of the queue. */
+static inline void
+__xfs_healthmon_push(
+	struct xfs_healthmon		*hm,
+	struct xfs_healthmon_event	*event)
+{
+	struct timespec64		now;
+
+	ktime_get_coarse_real_ts64(&now);
+	event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
+
+	if (!hm->first_event)
+		hm->first_event = event;
+	if (hm->last_event)
+		hm->last_event->next = event;
+	hm->last_event = event;
+	event->next = NULL;
+	xfs_healthmon_bump_events(hm);
+	wake_up(&hm->wait);
+
+	trace_xfs_healthmon_push(hm, event);
+}
+
+/* Deal with any previously lost events */
+static int
+xfs_healthmon_clear_lost_prev(
+	struct xfs_healthmon		*hm)
+{
+	struct xfs_healthmon_event	lost_event = {
+		.type			= XFS_HEALTHMON_LOST,
+		.domain			= XFS_HEALTHMON_MOUNT,
+		.lostcount		= hm->lost_prev_event,
+	};
+	struct xfs_healthmon_event	*event = NULL;
+
+	if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
+		trace_xfs_healthmon_merge(hm, hm->last_event);
+		wake_up(&hm->wait);
+		goto cleared;
+	}
+
+	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
+		event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
+				GFP_NOFS);
+	if (!event)
+		return -ENOMEM;
+
+	__xfs_healthmon_push(hm, event);
+cleared:
+	hm->lost_prev_event = 0;
+	return 0;
+}
+
+/*
+ * Push an event onto the end of the list after dealing with lost events and
+ * possibly full queues.
+ */
+STATIC int
+xfs_healthmon_push(
+	struct xfs_healthmon			*hm,
+	const struct xfs_healthmon_event	*template)
+{
+	struct xfs_healthmon_event		*event = NULL;
+	int					error = 0;
+
+	/*
+	 * Locklessly check if the health monitor has already detached from the
+	 * mount.  If so, ignore the event.  If we race with deactivation,
+	 * we'll queue the event but never send it.
+	 */
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+		return -ESHUTDOWN;
+
+	mutex_lock(&hm->lock);
+
+	/* Report previously lost events before we do anything else */
+	if (hm->lost_prev_event) {
+		error = xfs_healthmon_clear_lost_prev(hm);
+		if (error)
+			goto out_unlock;
+	}
+
+	/* Try to merge with the newest event */
+	if (xfs_healthmon_merge_events(hm->last_event, template)) {
+		trace_xfs_healthmon_merge(hm, hm->last_event);
+		wake_up(&hm->wait);
+		goto out_unlock;
+	}
+
+	/* Only create a heap event object if we're not already at capacity. */
+	if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
+		event = kmemdup(template, sizeof(struct xfs_healthmon_event),
+				GFP_NOFS);
+	if (!event) {
+		/* No memory means we lose the event */
+		trace_xfs_healthmon_lost_event(hm);
+		xfs_healthmon_bump_lost(hm);
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	__xfs_healthmon_push(hm, event);
+
+out_unlock:
+	mutex_unlock(&hm->lock);
+	return error;
+}
+
 /* Detach the xfs mount from this healthmon instance. */
 void
 xfs_healthmon_unmount(
@@ -129,12 +321,271 @@ xfs_healthmon_unmount(
 	xfs_healthmon_put(hm);
 }
 
+static inline void
+xfs_healthmon_reset_outbuf(
+	struct xfs_healthmon		*hm)
+{
+	hm->buftail = 0;
+	hm->bufhead = 0;
+}
+
+static const unsigned int domain_map[] = {
+	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
+};
+
+static const unsigned int type_map[] = {
+	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
+	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
+};
+
+/* Render event as a V0 structure */
+STATIC int
+xfs_healthmon_format_v0(
+	struct xfs_healthmon		*hm,
+	const struct xfs_healthmon_event *event)
+{
+	struct xfs_health_monitor_event	hme = {
+		.time_ns		= event->time_ns,
+	};
+
+	trace_xfs_healthmon_format(hm, event);
+
+	if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
+	    event->type < 0   || event->type >= ARRAY_SIZE(type_map))
+		return -EFSCORRUPTED;
+
+	hme.domain = domain_map[event->domain];
+	hme.type = type_map[event->type];
+
+	/* fill in the event-specific details */
+	switch (event->domain) {
+	case XFS_HEALTHMON_MOUNT:
+		switch (event->type) {
+		case XFS_HEALTHMON_LOST:
+			hme.e.lost.count = event->lostcount;
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);
+
+	/* copy formatted object to the outbuf */
+	if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
+		memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
+		hm->bufhead += sizeof(hme);
+	}
+
+	return 0;
+}
+
+/* How many bytes are waiting in the outbuf to be copied? */
+static inline size_t
+xfs_healthmon_outbuf_bytes(
+	struct xfs_healthmon	*hm)
+{
+	if (hm->bufhead > hm->buftail)
+		return hm->bufhead - hm->buftail;
+	return 0;
+}
+
+/*
+ * Do we have something for userspace to read?  This can mean unmount events,
+ * events pending in the queue, or pending bytes in the outbuf.
+ */
+static inline bool
+xfs_healthmon_has_eventdata(
+	struct xfs_healthmon	*hm)
+{
+	/*
+	 * If the health monitor is already detached from the xfs_mount, we
+	 * want reads to return 0 bytes even if there are no events, because
+	 * userspace interprets that as EOF.  If we race with deactivation,
+	 * read_iter will take the necessary locks to discover that there are
+	 * no events to send.
+	 */
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
+		return true;
+
+	/*
+	 * Either there are events waiting to be formatted into the buffer, or
+	 * there's unread bytes in the buffer.
+	 */
+	return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
+}
+
+/* Try to copy the rest of the outbuf to the iov iter, if it has room. */
+STATIC ssize_t
+xfs_healthmon_copybuf(
+	struct xfs_healthmon	*hm,
+	struct iov_iter		*to)
+{
+	size_t			to_copy;
+	size_t			w = 0;
+
+	trace_xfs_healthmon_copybuf(hm, to);
+
+	/* A full (or zero-length) iter is not a fault; just copy nothing. */
+	to_copy = xfs_healthmon_outbuf_bytes(hm);
+	if (to_copy && iov_iter_count(to)) {
+		w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
+		if (!w)
+			return -EFAULT;
+		hm->buftail += w;
+	}
+
+	/*
+	 * Nothing left to copy?  Reset the output buffer cursors to the start
+	 * since there's no live data in the buffer.
+	 */
+	if (xfs_healthmon_outbuf_bytes(hm) == 0)
+		xfs_healthmon_reset_outbuf(hm);
+	return w;
+}
+
+/*
+ * Pop the oldest queued event if the outbuf has room for one more formatted
+ * struct xfs_health_monitor_event; return NULL if the outbuf is full or the
+ * queue is empty.  Caller must hold i_rwsem on the healthmon file.
+ */
+static inline struct xfs_healthmon_event *
+xfs_healthmon_format_pop(
+	struct xfs_healthmon	*hm)
+{
+	struct xfs_healthmon_event *event;
+
+	if (hm->bufhead + sizeof(struct xfs_health_monitor_event) > hm->bufsize)
+		return NULL;
+
+	mutex_lock(&hm->lock);
+	event = hm->first_event;
+	if (event) {
+		if (hm->last_event == event)
+			hm->last_event = NULL;
+		hm->first_event = event->next;
+		hm->events--;
+
+		trace_xfs_healthmon_pop(hm, event);
+	}
+	mutex_unlock(&hm->lock);
+	return event;
+}
+
+/* Allocate formatting buffer */
+STATIC int
+xfs_healthmon_alloc_outbuf(
+	struct xfs_healthmon	*hm,
+	size_t			user_bufsize)
+{
+	void			*outbuf;
+	size_t			bufsize =
+		min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
+
+	outbuf = kzalloc(bufsize, GFP_KERNEL);
+	if (!outbuf) {
+		if (bufsize == PAGE_SIZE)
+			return -ENOMEM;
+
+		bufsize = PAGE_SIZE;
+		outbuf = kzalloc(bufsize, GFP_KERNEL);
+		if (!outbuf)
+			return -ENOMEM;
+	}
+
+	hm->buffer = outbuf;
+	hm->bufsize = bufsize;
+	hm->bufhead = 0;
+	hm->buftail = 0;
+
+	return 0;
+}
+
+/*
+ * Convey queued event data to userspace.  First copy any remaining bytes in
+ * the outbuf, then format the oldest event into the outbuf and copy that too.
+ */
 STATIC ssize_t
 xfs_healthmon_read_iter(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	return -EIO;
+	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file_inode(file);
+	struct xfs_healthmon	*hm = file->private_data;
+	struct xfs_healthmon_event *event;
+	size_t			copied = 0;
+	ssize_t			ret = 0;
+
+	if (file->f_flags & O_NONBLOCK) {
+		if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		ret = wait_event_interruptible(hm->wait,
+				xfs_healthmon_has_eventdata(hm));
+		if (ret)
+			return ret;
+
+		inode_lock(inode);
+	}
+
+	if (hm->bufsize == 0) {
+		ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
+		if (ret)
+			goto out_unlock;
+	}
+
+	trace_xfs_healthmon_read_start(hm);
+
+	/*
+	 * If there's anything left in the output buffer, copy that before
+	 * formatting more events.
+	 */
+	ret = xfs_healthmon_copybuf(hm, to);
+	if (ret < 0)
+		goto out_unlock;
+	copied += ret;
+
+	while (iov_iter_count(to) > 0) {
+		/* Format the next events into the outbuf until it's full. */
+		while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
+			ret = xfs_healthmon_format_v0(hm, event);
+			kfree(event);
+			if (ret)
+				goto out_unlock;
+		}
+
+		/* Copy anything formatted into outbuf to userspace */
+		ret = xfs_healthmon_copybuf(hm, to);
+		if (ret <= 0)
+			break;
+
+		copied += ret;
+	}
+
+out_unlock:
+	trace_xfs_healthmon_read_finish(hm);
+	inode_unlock(inode);
+	return copied ?: ret;
+}
+
+/* Poll for available events. */
+STATIC __poll_t
+xfs_healthmon_poll(
+	struct file			*file,
+	struct poll_table_struct	*wait)
+{
+	struct xfs_healthmon		*hm = file->private_data;
+	__poll_t			mask = 0;
+
+	poll_wait(file, &hm->wait, wait);
+
+	if (xfs_healthmon_has_eventdata(hm))
+		mask |= EPOLLIN;
+	return mask;
 }
 
 /* Free the health monitoring information. */
@@ -145,6 +596,8 @@ xfs_healthmon_release(
 {
 	struct xfs_healthmon	*hm = file->private_data;
 
+	trace_xfs_healthmon_release(hm);
+
 	/*
 	 * We might be closing the healthmon file before the filesystem
 	 * unmounts, because userspace processes can terminate at any time and
@@ -153,6 +606,12 @@ xfs_healthmon_release(
 	 */
 	xfs_healthmon_detach(hm);
 
+	/*
+	 * Wake up any readers that might be left.  There shouldn't be any
+	 * because the only users of the waiter are read and poll.
+	 */
+	wake_up_all(&hm->wait);
+
 	xfs_healthmon_put(hm);
 	return 0;
 }
@@ -162,9 +621,9 @@ static inline bool
 xfs_healthmon_validate(
 	const struct xfs_health_monitor	*hmo)
 {
-	if (hmo->flags)
+	if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
 		return false;
-	if (hmo->format)
+	if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
 		return false;
 	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
 		return false;
@@ -179,16 +638,21 @@ xfs_healthmon_show_fdinfo(
 {
 	struct xfs_healthmon	*hm = file->private_data;
 
-	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+	mutex_lock(&hm->lock);
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
 			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
 				"dead" : "alive",
-			MAJOR(hm->dev), MINOR(hm->dev));
+			MAJOR(hm->dev), MINOR(hm->dev),
+			hm->total_events,
+			hm->total_lost);
+	mutex_unlock(&hm->lock);
 }
 
 static const struct file_operations xfs_healthmon_fops = {
 	.owner		= THIS_MODULE,
 	.show_fdinfo	= xfs_healthmon_show_fdinfo,
 	.read_iter	= xfs_healthmon_read_iter,
+	.poll		= xfs_healthmon_poll,
 	.release	= xfs_healthmon_release,
 };
 
@@ -202,6 +666,7 @@ xfs_ioc_health_monitor(
 	struct xfs_health_monitor __user *arg)
 {
 	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon_event	*running_event;
 	struct xfs_healthmon		*hm;
 	struct xfs_inode		*ip = XFS_I(file_inode(file));
 	struct xfs_mount		*mp = ip->i_mount;
@@ -232,6 +697,22 @@ xfs_ioc_health_monitor(
 	hm->dev = mp->m_super->s_dev;
 	refcount_set(&hm->ref, 1);
 
+	mutex_init(&hm->lock);
+	init_waitqueue_head(&hm->wait);
+
+	if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
+		hm->verbose = true;
+
+	/* Queue up the first event that lets the client know we're running. */
+	running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS);
+	if (!running_event) {
+		ret = -ENOMEM;
+		goto out_hm;
+	}
+	running_event->type = XFS_HEALTHMON_RUNNING;
+	running_event->domain = XFS_HEALTHMON_MOUNT;
+	__xfs_healthmon_insert(hm, running_event);
+
 	/*
 	 * Try to attach this health monitor to the xfs_mount.  The monitor is
 	 * considered live and will receive events if this succeeds.
@@ -251,6 +732,8 @@ xfs_ioc_health_monitor(
 	if (ret < 0)
 		goto out_mp;
 
+	trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
+
 	return ret;
 
 out_mp:
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index a60556dbd172ee..d42b864a3837a2 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -51,6 +51,8 @@
 #include "xfs_rtgroup.h"
 #include "xfs_zone_alloc.h"
 #include "xfs_zone_priv.h"
+#include "xfs_health.h"
+#include "xfs_healthmon.h"
 
 /*
  * We include this last to have the helpers above available for the trace


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (2 preceding siblings ...)
  2026-01-16  5:42 ` [PATCH 03/11] xfs: create event queuing, formatting, and discovery infrastructure Darrick J. Wong
@ 2026-01-16  5:43 ` Darrick J. Wong
  2026-01-19 15:44   ` Christoph Hellwig
  2026-01-16  5:43 ` [PATCH 05/11] xfs: convey metadata health " Darrick J. Wong
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

In xfs_healthmon_unmount, send events to xfs_healer so that it knows
that nothing further can be done for the filesystem.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |    3 +++
 fs/xfs/xfs_healthmon.h |    4 ++++
 fs/xfs/xfs_trace.h     |    6 +++++-
 fs/xfs/xfs_healthmon.c |   32 +++++++++++++++++++++++++++++++-
 4 files changed, 43 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 22b86bc888de5a..59de6ab69fb319 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1014,6 +1014,9 @@ struct xfs_rtgroup_geometry {
 #define XFS_HEALTH_MONITOR_TYPE_RUNNING		(0)
 #define XFS_HEALTH_MONITOR_TYPE_LOST		(1)
 
+/* filesystem was unmounted */
+#define XFS_HEALTH_MONITOR_TYPE_UNMOUNT		(2)
+
 /* lost events */
 struct xfs_health_monitor_lost {
 	__u64	count;
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 554ec62125449b..3044bb46485d7e 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -34,6 +34,9 @@ struct xfs_healthmon {
 	struct xfs_healthmon_event	*first_event;
 	struct xfs_healthmon_event	*last_event;
 
+	/* preallocated event for unmount */
+	struct xfs_healthmon_event	*unmount_event;
+
 	/* number of events in the list */
 	unsigned int			events;
 
@@ -67,6 +70,7 @@ void xfs_healthmon_unmount(struct xfs_mount *mp);
 enum xfs_healthmon_type {
 	XFS_HEALTHMON_RUNNING,	/* monitor running */
 	XFS_HEALTHMON_LOST,	/* message lost */
+	XFS_HEALTHMON_UNMOUNT,	/* filesystem is unmounting */
 };
 
 enum xfs_healthmon_domain {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 04727470b3b410..305cae8f497b43 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6005,14 +6005,18 @@ DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_start);
 DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_finish);
 DEFINE_HEALTHMON_EVENT(xfs_healthmon_release);
 DEFINE_HEALTHMON_EVENT(xfs_healthmon_detach);
+DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount);
 
 #define XFS_HEALTHMON_TYPE_STRINGS \
-	{ XFS_HEALTHMON_LOST,		"lost" }
+	{ XFS_HEALTHMON_LOST,		"lost" }, \
+	{ XFS_HEALTHMON_UNMOUNT,	"unmount" }
 
 #define XFS_HEALTHMON_DOMAIN_STRINGS \
 	{ XFS_HEALTHMON_MOUNT,		"mount" }
 
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT);
+
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT);
 
 DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index f1c6782f5e3915..c218838e6e59f4 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -90,6 +90,7 @@ xfs_healthmon_put(
 			kfree(event);
 		}
 
+		kfree(hm->unmount_event);
 		kfree(hm->buffer);
 		mutex_destroy(&hm->lock);
 		kfree_rcu_mightsleep(hm);
@@ -166,6 +167,7 @@ xfs_healthmon_merge_events(
 
 	switch (existing->type) {
 	case XFS_HEALTHMON_RUNNING:
+	case XFS_HEALTHMON_UNMOUNT:
 		/* should only ever be one of these events anyway */
 		return false;
 
@@ -307,7 +309,10 @@ xfs_healthmon_push(
 	return error;
 }
 
-/* Detach the xfs mount from this healthmon instance. */
+/*
+ * Report that the filesystem is being unmounted, then detach the xfs mount
+ * from this healthmon instance.
+ */
 void
 xfs_healthmon_unmount(
 	struct xfs_mount		*mp)
@@ -317,6 +322,17 @@ xfs_healthmon_unmount(
 	if (!hm)
 		return;
 
+	trace_xfs_healthmon_report_unmount(hm);
+
+	/*
+	 * Insert the unmount notification at the start of the event queue so
+	 * that userspace knows the filesystem went away as soon as possible.
+	 * There's nothing actionable for userspace after an unmount.  Once
+	 * we've inserted the unmount event, hm no longer owns that event.
+	 */
+	__xfs_healthmon_insert(hm, hm->unmount_event);
+	hm->unmount_event = NULL;
+
 	xfs_healthmon_detach(hm);
 	xfs_healthmon_put(hm);
 }
@@ -713,6 +729,20 @@ xfs_ioc_health_monitor(
 	running_event->domain = XFS_HEALTHMON_MOUNT;
 	__xfs_healthmon_insert(hm, running_event);
 
+	/*
+	 * Preallocate the unmount event so that we can't fail to notify the
+	 * userspace later.  This is key for triggering fast exit of the
+	 * xfs_healer daemon.
+	 */
+	hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event),
+			GFP_NOFS);
+	if (!hm->unmount_event) {
+		ret = -ENOMEM;
+		goto out_hm;
+	}
+	hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
+	hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
+
 	/*
 	 * Try to attach this health monitor to the xfs_mount.  The monitor is
 	 * considered live and will receive events if this succeeds.


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 05/11] xfs: convey metadata health events to the health monitor
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (3 preceding siblings ...)
  2026-01-16  5:43 ` [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor Darrick J. Wong
@ 2026-01-16  5:43 ` Darrick J. Wong
  2026-01-16  5:43 ` [PATCH 06/11] xfs: convey filesystem shutdown " Darrick J. Wong
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Connect the filesystem metadata health event collection system to the
health monitor so that xfs can send events to xfs_healer as it collects
information.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h     |   35 +++++++++
 fs/xfs/libxfs/xfs_health.h |    5 +
 fs/xfs/xfs_healthmon.h     |   39 +++++++++
 fs/xfs/xfs_trace.h         |  130 +++++++++++++++++++++++++++++++-
 fs/xfs/xfs_health.c        |  123 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_healthmon.c     |  181 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 511 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 59de6ab69fb319..04e1dcf61257d0 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1008,6 +1008,12 @@ struct xfs_rtgroup_geometry {
 /* affects the whole fs */
 #define XFS_HEALTH_MONITOR_DOMAIN_MOUNT		(0)
 
+/* metadata health events */
+#define XFS_HEALTH_MONITOR_DOMAIN_FS		(1)
+#define XFS_HEALTH_MONITOR_DOMAIN_AG		(2)
+#define XFS_HEALTH_MONITOR_DOMAIN_INODE		(3)
+#define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP	(4)
+
 /* Health monitor event types */
 
 /* status of the monitor itself */
@@ -1017,11 +1023,37 @@ struct xfs_rtgroup_geometry {
 /* filesystem was unmounted */
 #define XFS_HEALTH_MONITOR_TYPE_UNMOUNT		(2)
 
+/* metadata health events */
+#define XFS_HEALTH_MONITOR_TYPE_SICK		(3)
+#define XFS_HEALTH_MONITOR_TYPE_CORRUPT		(4)
+#define XFS_HEALTH_MONITOR_TYPE_HEALTHY		(5)
+
 /* lost events */
 struct xfs_health_monitor_lost {
 	__u64	count;
 };
 
+/* fs/rt metadata */
+struct xfs_health_monitor_fs {
+	/* XFS_FSOP_GEOM_SICK_* flags */
+	__u32	mask;
+};
+
+/* ag/rtgroup metadata */
+struct xfs_health_monitor_group {
+	/* XFS_{AG,RTGROUP}_GEOM_SICK_* flags */
+	__u32	mask;
+	__u32	gno;
+};
+
+/* inode metadata */
+struct xfs_health_monitor_inode {
+	/* XFS_BS_SICK_* flags */
+	__u32	mask;
+	__u32	gen;
+	__u64	ino;
+};
+
 struct xfs_health_monitor_event {
 	/* XFS_HEALTH_MONITOR_DOMAIN_* */
 	__u32	domain;
@@ -1039,6 +1071,9 @@ struct xfs_health_monitor_event {
 	 */
 	union {
 		struct xfs_health_monitor_lost lost;
+		struct xfs_health_monitor_fs fs;
+		struct xfs_health_monitor_group group;
+		struct xfs_health_monitor_inode inode;
 	} e;
 
 	/* zeroes */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index b31000f7190ce5..1d45cf5789e864 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -289,4 +289,9 @@ void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
 #define xfs_metadata_is_sick(error) \
 	(unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC))
 
+unsigned int xfs_healthmon_inode_mask(unsigned int sick_mask);
+unsigned int xfs_healthmon_rtgroup_mask(unsigned int sick_mask);
+unsigned int xfs_healthmon_perag_mask(unsigned int sick_mask);
+unsigned int xfs_healthmon_fs_mask(unsigned int sick_mask);
+
 #endif	/* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 3044bb46485d7e..121e5942639524 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -71,10 +71,21 @@ enum xfs_healthmon_type {
 	XFS_HEALTHMON_RUNNING,	/* monitor running */
 	XFS_HEALTHMON_LOST,	/* message lost */
 	XFS_HEALTHMON_UNMOUNT,	/* filesystem is unmounting */
+
+	/* metadata health events */
+	XFS_HEALTHMON_SICK,	/* runtime corruption observed */
+	XFS_HEALTHMON_CORRUPT,	/* fsck reported corruption */
+	XFS_HEALTHMON_HEALTHY,	/* fsck reported healthy structure */
 };
 
 enum xfs_healthmon_domain {
 	XFS_HEALTHMON_MOUNT,	/* affects the whole fs */
+
+	/* metadata health events */
+	XFS_HEALTHMON_FS,	/* main filesystem metadata */
+	XFS_HEALTHMON_AG,	/* allocation group metadata */
+	XFS_HEALTHMON_INODE,	/* inode metadata */
+	XFS_HEALTHMON_RTGROUP,	/* realtime group metadata */
 };
 
 struct xfs_healthmon_event {
@@ -90,9 +101,37 @@ struct xfs_healthmon_event {
 		struct {
 			uint64_t	lostcount;
 		};
+		/* fs/rt metadata */
+		struct {
+			/* XFS_SICK_FS_* flags */
+			unsigned int	fsmask;
+		};
+		/* ag/rtgroup metadata */
+		struct {
+			/* XFS_SICK_(AG|RG)* flags */
+			unsigned int	grpmask;
+			unsigned int	group;
+		};
+		/* inode metadata */
+		struct {
+			/* XFS_SICK_INO_* flags */
+			unsigned int	imask;
+			uint32_t	gen;
+			xfs_ino_t	ino;
+		};
 	};
 };
 
+void xfs_healthmon_report_fs(struct xfs_mount *mp,
+		enum xfs_healthmon_type type, unsigned int old_mask,
+		unsigned int new_mask);
+void xfs_healthmon_report_group(struct xfs_group *xg,
+		enum xfs_healthmon_type type, unsigned int old_mask,
+		unsigned int new_mask);
+void xfs_healthmon_report_inode(struct xfs_inode *ip,
+		enum xfs_healthmon_type type, unsigned int old_mask,
+		unsigned int new_mask);
+
 long xfs_ioc_health_monitor(struct file *file,
 		struct xfs_health_monitor __user *arg);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 305cae8f497b43..debe9846418a04 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6009,15 +6009,29 @@ DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount);
 
 #define XFS_HEALTHMON_TYPE_STRINGS \
 	{ XFS_HEALTHMON_LOST,		"lost" }, \
-	{ XFS_HEALTHMON_UNMOUNT,	"unmount" }
+	{ XFS_HEALTHMON_UNMOUNT,	"unmount" }, \
+	{ XFS_HEALTHMON_SICK,		"sick" }, \
+	{ XFS_HEALTHMON_CORRUPT,	"corrupt" }, \
+	{ XFS_HEALTHMON_HEALTHY,	"healthy" }
 
 #define XFS_HEALTHMON_DOMAIN_STRINGS \
-	{ XFS_HEALTHMON_MOUNT,		"mount" }
+	{ XFS_HEALTHMON_MOUNT,		"mount" }, \
+	{ XFS_HEALTHMON_FS,		"fs" }, \
+	{ XFS_HEALTHMON_AG,		"ag" }, \
+	{ XFS_HEALTHMON_INODE,		"inode" }, \
+	{ XFS_HEALTHMON_RTGROUP,	"rtgroup" }
 
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST);
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_SICK);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_CORRUPT);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_HEALTHY);
 
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_FS);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_AG);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_INODE);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_RTGROUP);
 
 DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
 	TP_PROTO(const struct xfs_healthmon *hm,
@@ -6054,6 +6068,19 @@ DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
 				break;
 			}
 			break;
+		case XFS_HEALTHMON_FS:
+			__entry->mask = event->fsmask;
+			break;
+		case XFS_HEALTHMON_AG:
+		case XFS_HEALTHMON_RTGROUP:
+			__entry->mask = event->grpmask;
+			__entry->group = event->group;
+			break;
+		case XFS_HEALTHMON_INODE:
+			__entry->mask = event->imask;
+			__entry->ino = event->ino;
+			__entry->gen = event->gen;
+			break;
 		}
 	),
 	TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu",
@@ -6081,6 +6108,105 @@ DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format_overflow);
 DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_drop);
 DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_merge);
 
+TRACE_EVENT(xfs_healthmon_report_fs,
+	TP_PROTO(const struct xfs_healthmon *hm,
+		 unsigned int old_mask, unsigned int new_mask,
+		 const struct xfs_healthmon_event *event),
+	TP_ARGS(hm, old_mask, new_mask, event),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned int, domain)
+		__field(unsigned int, old_mask)
+		__field(unsigned int, new_mask)
+		__field(unsigned int, fsmask)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->type = event->type;
+		__entry->domain = event->domain;
+		__entry->old_mask = old_mask;
+		__entry->new_mask = new_mask;
+		__entry->fsmask = event->fsmask;
+	),
+	TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x fsmask 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS),
+		  __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS),
+		  __entry->old_mask,
+		  __entry->new_mask,
+		  __entry->fsmask)
+);
+
+TRACE_EVENT(xfs_healthmon_report_group,
+	TP_PROTO(const struct xfs_healthmon *hm,
+		 unsigned int old_mask, unsigned int new_mask,
+		 const struct xfs_healthmon_event *event),
+	TP_ARGS(hm, old_mask, new_mask, event),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned int, domain)
+		__field(unsigned int, old_mask)
+		__field(unsigned int, new_mask)
+		__field(unsigned int, grpmask)
+		__field(unsigned int, group)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->type = event->type;
+		__entry->domain = event->domain;
+		__entry->old_mask = old_mask;
+		__entry->new_mask = new_mask;
+		__entry->grpmask = event->grpmask;
+		__entry->group = event->group;
+	),
+	TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x grpmask 0x%x group 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS),
+		  __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS),
+		  __entry->old_mask,
+		  __entry->new_mask,
+		  __entry->grpmask,
+		  __entry->group)
+);
+
+TRACE_EVENT(xfs_healthmon_report_inode,
+	TP_PROTO(const struct xfs_healthmon *hm,
+		 unsigned int old_mask, unsigned int new_mask,
+		 const struct xfs_healthmon_event *event),
+	TP_ARGS(hm, old_mask, new_mask, event),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned int, domain)
+		__field(unsigned int, old_mask)
+		__field(unsigned int, new_mask)
+		__field(unsigned int, imask)
+		__field(unsigned long long, ino)
+		__field(unsigned int, gen)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->type = event->type;
+		__entry->domain = event->domain;
+		__entry->old_mask = old_mask;
+		__entry->new_mask = new_mask;
+		__entry->imask = event->imask;
+		__entry->ino = event->ino;
+		__entry->gen = event->gen;
+	),
+	TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x imask 0x%x ino 0x%llx gen 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS),
+		  __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS),
+		  __entry->old_mask,
+		  __entry->new_mask,
+		  __entry->imask,
+		  __entry->ino,
+		  __entry->gen)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 3d50397f8f7c00..f243c06fd44762 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -108,14 +108,19 @@ xfs_fs_mark_sick(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
 	trace_xfs_fs_mark_sick(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
+	old_mask = mp->m_fs_sick;
 	mp->m_fs_sick |= mask;
 	spin_unlock(&mp->m_sb_lock);
 
 	fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_fs(mp, XFS_HEALTHMON_SICK, old_mask, mask);
 }
 
 /* Mark per-fs metadata as having been checked and found unhealthy by fsck. */
@@ -124,15 +129,21 @@ xfs_fs_mark_corrupt(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
 	trace_xfs_fs_mark_corrupt(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
+	old_mask = mp->m_fs_sick;
 	mp->m_fs_sick |= mask;
 	mp->m_fs_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
 
 	fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_fs(mp, XFS_HEALTHMON_CORRUPT, old_mask,
+				mask);
 }
 
 /* Mark a per-fs metadata healed. */
@@ -141,15 +152,22 @@ xfs_fs_mark_healthy(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
 	trace_xfs_fs_mark_healthy(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
+	old_mask = mp->m_fs_sick;
 	mp->m_fs_sick &= ~mask;
 	if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY))
 		mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY;
 	mp->m_fs_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
+
+	if (mask)
+		xfs_healthmon_report_fs(mp, XFS_HEALTHMON_HEALTHY, old_mask,
+				mask);
 }
 
 /* Sample which per-fs metadata are unhealthy. */
@@ -199,14 +217,20 @@ xfs_group_mark_sick(
 	struct xfs_group	*xg,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	xfs_group_check_mask(xg, mask);
 	trace_xfs_group_mark_sick(xg, mask);
 
 	spin_lock(&xg->xg_state_lock);
+	old_mask = xg->xg_sick;
 	xg->xg_sick |= mask;
 	spin_unlock(&xg->xg_state_lock);
 
 	fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_group(xg, XFS_HEALTHMON_SICK, old_mask,
+				mask);
 }
 
 /*
@@ -217,15 +241,21 @@ xfs_group_mark_corrupt(
 	struct xfs_group	*xg,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	xfs_group_check_mask(xg, mask);
 	trace_xfs_group_mark_corrupt(xg, mask);
 
 	spin_lock(&xg->xg_state_lock);
+	old_mask = xg->xg_sick;
 	xg->xg_sick |= mask;
 	xg->xg_checked |= mask;
 	spin_unlock(&xg->xg_state_lock);
 
 	fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_group(xg, XFS_HEALTHMON_CORRUPT, old_mask,
+				mask);
 }
 
 /*
@@ -236,15 +266,22 @@ xfs_group_mark_healthy(
 	struct xfs_group	*xg,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	xfs_group_check_mask(xg, mask);
 	trace_xfs_group_mark_healthy(xg, mask);
 
 	spin_lock(&xg->xg_state_lock);
+	old_mask = xg->xg_sick;
 	xg->xg_sick &= ~mask;
 	if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY))
 		xg->xg_sick &= ~XFS_SICK_AG_SECONDARY;
 	xg->xg_checked |= mask;
 	spin_unlock(&xg->xg_state_lock);
+
+	if (mask)
+		xfs_healthmon_report_group(xg, XFS_HEALTHMON_HEALTHY, old_mask,
+				mask);
 }
 
 /* Sample which per-ag metadata are unhealthy. */
@@ -283,10 +320,13 @@ xfs_inode_mark_sick(
 	struct xfs_inode	*ip,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
 	trace_xfs_inode_mark_sick(ip, mask);
 
 	spin_lock(&ip->i_flags_lock);
+	old_mask = ip->i_sick;
 	ip->i_sick |= mask;
 	spin_unlock(&ip->i_flags_lock);
 
@@ -300,6 +340,9 @@ xfs_inode_mark_sick(
 	spin_unlock(&VFS_I(ip)->i_lock);
 
 	fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_inode(ip, XFS_HEALTHMON_SICK, old_mask,
+				mask);
 }
 
 /* Mark inode metadata as having been checked and found unhealthy by fsck. */
@@ -308,10 +351,13 @@ xfs_inode_mark_corrupt(
 	struct xfs_inode	*ip,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
 	trace_xfs_inode_mark_corrupt(ip, mask);
 
 	spin_lock(&ip->i_flags_lock);
+	old_mask = ip->i_sick;
 	ip->i_sick |= mask;
 	ip->i_checked |= mask;
 	spin_unlock(&ip->i_flags_lock);
@@ -326,6 +372,9 @@ xfs_inode_mark_corrupt(
 	spin_unlock(&VFS_I(ip)->i_lock);
 
 	fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS);
+	if (mask)
+		xfs_healthmon_report_inode(ip, XFS_HEALTHMON_CORRUPT, old_mask,
+				mask);
 }
 
 /* Mark parts of an inode healed. */
@@ -334,15 +383,22 @@ xfs_inode_mark_healthy(
 	struct xfs_inode	*ip,
 	unsigned int		mask)
 {
+	unsigned int		old_mask;
+
 	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
 	trace_xfs_inode_mark_healthy(ip, mask);
 
 	spin_lock(&ip->i_flags_lock);
+	old_mask = ip->i_sick;
 	ip->i_sick &= ~mask;
 	if (!(ip->i_sick & XFS_SICK_INO_PRIMARY))
 		ip->i_sick &= ~XFS_SICK_INO_SECONDARY;
 	ip->i_checked |= mask;
 	spin_unlock(&ip->i_flags_lock);
+
+	if (mask)
+		xfs_healthmon_report_inode(ip, XFS_HEALTHMON_HEALTHY, old_mask,
+				mask);
 }
 
 /* Sample which parts of an inode are unhealthy. */
@@ -422,6 +478,25 @@ xfs_fsop_geom_health(
 	}
 }
 
+/*
+ * Translate XFS_SICK_FS_* into XFS_FSOP_GEOM_SICK_* except for the rt free
+ * space codes, which are sent via the rtgroup events.
+ */
+unsigned int
+xfs_healthmon_fs_mask(
+	unsigned int			sick_mask)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			ioctl_mask = 0;
+
+	for_each_sick_map(fs_map, m) {
+		if (sick_mask & m->sick_mask)
+			ioctl_mask |= m->ioctl_mask;
+	}
+
+	return ioctl_mask;
+}
+
 static const struct ioctl_sick_map ag_map[] = {
 	{ XFS_SICK_AG_SB,	XFS_AG_GEOM_SICK_SB },
 	{ XFS_SICK_AG_AGF,	XFS_AG_GEOM_SICK_AGF },
@@ -458,6 +533,22 @@ xfs_ag_geom_health(
 	}
 }
 
+/* Translate XFS_SICK_AG_* into XFS_AG_GEOM_SICK_*. */
+unsigned int
+xfs_healthmon_perag_mask(
+	unsigned int			sick_mask)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			ioctl_mask = 0;
+
+	for_each_sick_map(ag_map, m) {
+		if (sick_mask & m->sick_mask)
+			ioctl_mask |= m->ioctl_mask;
+	}
+
+	return ioctl_mask;
+}
+
 static const struct ioctl_sick_map rtgroup_map[] = {
 	{ XFS_SICK_RG_SUPER,	XFS_RTGROUP_GEOM_SICK_SUPER },
 	{ XFS_SICK_RG_BITMAP,	XFS_RTGROUP_GEOM_SICK_BITMAP },
@@ -488,6 +579,22 @@ xfs_rtgroup_geom_health(
 	}
 }
 
+/* Translate XFS_SICK_RG_* into XFS_RTGROUP_GEOM_SICK_*. */
+unsigned int
+xfs_healthmon_rtgroup_mask(
+	unsigned int			sick_mask)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			ioctl_mask = 0;
+
+	for_each_sick_map(rtgroup_map, m) {
+		if (sick_mask & m->sick_mask)
+			ioctl_mask |= m->ioctl_mask;
+	}
+
+	return ioctl_mask;
+}
+
 static const struct ioctl_sick_map ino_map[] = {
 	{ XFS_SICK_INO_CORE,	XFS_BS_SICK_INODE },
 	{ XFS_SICK_INO_BMBTD,	XFS_BS_SICK_BMBTD },
@@ -526,6 +633,22 @@ xfs_bulkstat_health(
 	}
 }
 
+/* Translate XFS_SICK_INO_* into XFS_BS_SICK_*. */
+unsigned int
+xfs_healthmon_inode_mask(
+	unsigned int			sick_mask)
+{
+	const struct ioctl_sick_map	*m;
+	unsigned int			ioctl_mask = 0;
+
+	for_each_sick_map(ino_map, m) {
+		if (sick_mask & m->sick_mask)
+			ioctl_mask |= m->ioctl_mask;
+	}
+
+	return ioctl_mask;
+}
+
 /* Mark a block mapping sick. */
 void
 xfs_bmap_mark_sick(
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index c218838e6e59f4..0039a79822e86a 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -18,6 +18,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_health.h"
 #include "xfs_healthmon.h"
 
 #include <linux/anon_inodes.h>
@@ -174,6 +175,33 @@ xfs_healthmon_merge_events(
 	case XFS_HEALTHMON_LOST:
 		existing->lostcount += new->lostcount;
 		return true;
+
+	case XFS_HEALTHMON_SICK:
+	case XFS_HEALTHMON_CORRUPT:
+	case XFS_HEALTHMON_HEALTHY:
+		switch (existing->domain) {
+		case XFS_HEALTHMON_FS:
+			existing->fsmask |= new->fsmask;
+			return true;
+		case XFS_HEALTHMON_AG:
+		case XFS_HEALTHMON_RTGROUP:
+			if (existing->group == new->group){
+				existing->grpmask |= new->grpmask;
+				return true;
+			}
+			return false;
+		case XFS_HEALTHMON_INODE:
+			if (existing->ino == new->ino &&
+			    existing->gen == new->gen) {
+				existing->imask |= new->imask;
+				return true;
+			}
+			return false;
+		default:
+			ASSERT(0);
+			return false;
+		}
+		return false;
 	}
 
 	return false;
@@ -337,6 +365,135 @@ xfs_healthmon_unmount(
 	xfs_healthmon_put(hm);
 }
 
+/* Compute the reporting mask for non-unmount metadata health events. */
+static inline unsigned int
+metadata_event_mask(
+	struct xfs_healthmon		*hm,
+	enum xfs_healthmon_type		type,
+	unsigned int			old_mask,
+	unsigned int			new_mask)
+{
+	/* If we want all events, return all events. */
+	if (hm->verbose)
+		return new_mask;
+
+	switch (type) {
+	case XFS_HEALTHMON_SICK:
+		/* Always report runtime corruptions */
+		return new_mask;
+	case XFS_HEALTHMON_CORRUPT:
+		/* Only report new fsck errors */
+		return new_mask & ~old_mask;
+	case XFS_HEALTHMON_HEALTHY:
+		/* Only report healthy metadata that got fixed */
+		return new_mask & old_mask;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	return 0;
+}
+
+/* Report XFS_SICK_FS_* flags to healthmon */
+void
+xfs_healthmon_report_fs(
+	struct xfs_mount		*mp,
+	enum xfs_healthmon_type		type,
+	unsigned int			old_mask,
+	unsigned int			new_mask)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= type,
+		.domain			= XFS_HEALTHMON_FS,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
+			~XFS_SICK_FS_SECONDARY;
+	trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);
+
+	if (event.fsmask)
+		xfs_healthmon_push(hm, &event);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Report XFS_SICK_(AG|RG)* flags to healthmon */
+void
+xfs_healthmon_report_group(
+	struct xfs_group		*xg,
+	enum xfs_healthmon_type		type,
+	unsigned int			old_mask,
+	unsigned int			new_mask)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= type,
+		.group			= xg->xg_gno,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(xg->xg_mount);
+
+	if (!hm)
+		return;
+
+	switch (xg->xg_type) {
+	case XG_TYPE_RTG:
+		event.domain = XFS_HEALTHMON_RTGROUP;
+		event.grpmask = metadata_event_mask(hm, type, old_mask,
+						    new_mask) &
+				~XFS_SICK_RG_SECONDARY;
+		break;
+	case XG_TYPE_AG:
+		event.domain = XFS_HEALTHMON_AG;
+		event.grpmask = metadata_event_mask(hm, type, old_mask,
+						    new_mask) &
+				~XFS_SICK_AG_SECONDARY;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);
+
+	if (event.grpmask)
+		xfs_healthmon_push(hm, &event);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Queue a @type event for @ip's XFS_SICK_INO_* flags, minus SECONDARY bits. */
+void
+xfs_healthmon_report_inode(
+	struct xfs_inode		*ip,
+	enum xfs_healthmon_type		type,
+	unsigned int			old_mask,
+	unsigned int			new_mask)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= type,
+		.domain			= XFS_HEALTHMON_INODE,
+		.ino			= ip->i_ino,
+		.gen			= VFS_I(ip)->i_generation,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);
+
+	if (!hm)
+		return;
+
+	event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
+			~XFS_SICK_INO_SECONDARY;
+	trace_xfs_healthmon_report_inode(hm, old_mask, new_mask, &event);
+
+	if (event.imask)
+		xfs_healthmon_push(hm, &event);
+
+	xfs_healthmon_put(hm);
+}
+
 static inline void
 xfs_healthmon_reset_outbuf(
 	struct xfs_healthmon		*hm)
@@ -347,11 +504,19 @@ xfs_healthmon_reset_outbuf(
 
 static const unsigned int domain_map[] = {
 	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
+	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
+	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
+	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
+	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
 };
 
 static const unsigned int type_map[] = {
 	[XFS_HEALTHMON_RUNNING]		= XFS_HEALTH_MONITOR_TYPE_RUNNING,
 	[XFS_HEALTHMON_LOST]		= XFS_HEALTH_MONITOR_TYPE_LOST,
+	[XFS_HEALTHMON_SICK]		= XFS_HEALTH_MONITOR_TYPE_SICK,
+	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
+	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
+	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
 };
 
 /* Render event as a V0 structure */
@@ -384,6 +549,22 @@ xfs_healthmon_format_v0(
 			break;
 		}
 		break;
+	case XFS_HEALTHMON_FS:
+		hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
+		break;
+	case XFS_HEALTHMON_RTGROUP:
+		hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
+		hme.e.group.gno = event->group;
+		break;
+	case XFS_HEALTHMON_AG:
+		hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
+		hme.e.group.gno = event->group;
+		break;
+	case XFS_HEALTHMON_INODE:
+		hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
+		hme.e.inode.ino = event->ino;
+		hme.e.inode.gen = event->gen;
+		break;
 	default:
 		break;
 	}


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 06/11] xfs: convey filesystem shutdown events to the health monitor
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (4 preceding siblings ...)
  2026-01-16  5:43 ` [PATCH 05/11] xfs: convey metadata health " Darrick J. Wong
@ 2026-01-16  5:43 ` Darrick J. Wong
  2026-01-19 15:44   ` Christoph Hellwig
  2026-01-16  5:43 ` [PATCH 07/11] xfs: convey externally discovered fsdax media errors " Darrick J. Wong
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Connect the filesystem shutdown code to the health monitor so that xfs
can send events about that to the xfs_healer daemon.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |   18 ++++++++++++
 fs/xfs/xfs_healthmon.h |    9 ++++++
 fs/xfs/xfs_trace.h     |   23 +++++++++++++++-
 fs/xfs/xfs_fsops.c     |    2 +
 fs/xfs/xfs_healthmon.c |   70 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 121 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 04e1dcf61257d0..c8f7011a7ef8ef 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1028,6 +1028,9 @@ struct xfs_rtgroup_geometry {
 #define XFS_HEALTH_MONITOR_TYPE_CORRUPT		(4)
 #define XFS_HEALTH_MONITOR_TYPE_HEALTHY		(5)
 
+/* filesystem shutdown */
+#define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN	(6)
+
 /* lost events */
 struct xfs_health_monitor_lost {
 	__u64	count;
@@ -1054,6 +1057,20 @@ struct xfs_health_monitor_inode {
 	__u64	ino;
 };
 
+/* shutdown reasons */
+#define XFS_HEALTH_SHUTDOWN_META_IO_ERROR	(1u << 0)
+#define XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR	(1u << 1)
+#define XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT	(1u << 2)
+#define XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE	(1u << 3)
+#define XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK	(1u << 4)
+#define XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED	(1u << 5)
+
+/* shutdown */
+struct xfs_health_monitor_shutdown {
+	/* XFS_HEALTH_SHUTDOWN_* flags */
+	__u32	reasons;
+};
+
 struct xfs_health_monitor_event {
 	/* XFS_HEALTH_MONITOR_DOMAIN_* */
 	__u32	domain;
@@ -1074,6 +1091,7 @@ struct xfs_health_monitor_event {
 		struct xfs_health_monitor_fs fs;
 		struct xfs_health_monitor_group group;
 		struct xfs_health_monitor_inode inode;
+		struct xfs_health_monitor_shutdown shutdown;
 	} e;
 
 	/* zeroes */
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 121e5942639524..1f68b5d65a8edc 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -72,6 +72,9 @@ enum xfs_healthmon_type {
 	XFS_HEALTHMON_LOST,	/* message lost */
 	XFS_HEALTHMON_UNMOUNT,	/* filesystem is unmounting */
 
+	/* filesystem shutdown */
+	XFS_HEALTHMON_SHUTDOWN,
+
 	/* metadata health events */
 	XFS_HEALTHMON_SICK,	/* runtime corruption observed */
 	XFS_HEALTHMON_CORRUPT,	/* fsck reported corruption */
@@ -119,6 +122,10 @@ struct xfs_healthmon_event {
 			uint32_t	gen;
 			xfs_ino_t	ino;
 		};
+		/* shutdown */
+		struct {
+			unsigned int	flags;
+		};
 	};
 };
 
@@ -132,6 +139,8 @@ void xfs_healthmon_report_inode(struct xfs_inode *ip,
 		enum xfs_healthmon_type type, unsigned int old_mask,
 		unsigned int new_mask);
 
+void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags);
+
 long xfs_ioc_health_monitor(struct file *file,
 		struct xfs_health_monitor __user *arg);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index debe9846418a04..ec99a6d3dd318c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6012,7 +6012,8 @@ DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount);
 	{ XFS_HEALTHMON_UNMOUNT,	"unmount" }, \
 	{ XFS_HEALTHMON_SICK,		"sick" }, \
 	{ XFS_HEALTHMON_CORRUPT,	"corrupt" }, \
-	{ XFS_HEALTHMON_HEALTHY,	"healthy" }
+	{ XFS_HEALTHMON_HEALTHY,	"healthy" }, \
+	{ XFS_HEALTHMON_SHUTDOWN,	"shutdown" }
 
 #define XFS_HEALTHMON_DOMAIN_STRINGS \
 	{ XFS_HEALTHMON_MOUNT,		"mount" }, \
@@ -6022,6 +6023,7 @@ DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount);
 	{ XFS_HEALTHMON_RTGROUP,	"rtgroup" }
 
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST);
+TRACE_DEFINE_ENUM(XFS_HEALTHMON_SHUTDOWN);
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT);
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_SICK);
 TRACE_DEFINE_ENUM(XFS_HEALTHMON_CORRUPT);
@@ -6063,6 +6065,9 @@ DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
 		switch (__entry->domain) {
 		case XFS_HEALTHMON_MOUNT:
 			switch (__entry->type) {
+			case XFS_HEALTHMON_SHUTDOWN:
+				__entry->mask = event->flags;
+				break;
 			case XFS_HEALTHMON_LOST:
 				__entry->lostcount = event->lostcount;
 				break;
@@ -6207,6 +6212,22 @@ TRACE_EVENT(xfs_healthmon_report_inode,
 		  __entry->gen)
 );
 
+TRACE_EVENT(xfs_healthmon_report_shutdown,
+	TP_PROTO(const struct xfs_healthmon *hm, uint32_t shutdown_flags),
+	TP_ARGS(hm, shutdown_flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(uint32_t, shutdown_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->shutdown_flags = shutdown_flags;
+	),
+	TP_printk("dev %d:%d shutdown_flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS))
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b7c21f68edc78d..368173bf8a4091 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -25,6 +25,7 @@
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
 #include "xfs_metafile.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
@@ -544,6 +545,7 @@ xfs_do_force_shutdown(
 		xfs_stack_trace();
 
 	fserror_report_shutdown(mp->m_super, GFP_KERNEL);
+	xfs_healthmon_report_shutdown(mp, flags);
 }
 
 /*
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index 0039a79822e86a..97f764e7954152 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -20,6 +20,7 @@
 #include "xfs_rtgroup.h"
 #include "xfs_health.h"
 #include "xfs_healthmon.h"
+#include "xfs_fsops.h"
 
 #include <linux/anon_inodes.h>
 #include <linux/eventpoll.h>
@@ -202,6 +203,11 @@ xfs_healthmon_merge_events(
 			return false;
 		}
 		return false;
+
+	case XFS_HEALTHMON_SHUTDOWN:
+		/* yes, we can race to shutdown */
+		existing->flags |= new->flags;
+		return true;
 	}
 
 	return false;
@@ -494,6 +500,28 @@ xfs_healthmon_report_inode(
 	xfs_healthmon_put(hm);
 }
 
+/* Add a shutdown event to the reporting queue. */
+void
+xfs_healthmon_report_shutdown(
+	struct xfs_mount		*mp,
+	uint32_t			flags)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= XFS_HEALTHMON_SHUTDOWN,
+		.domain			= XFS_HEALTHMON_MOUNT,
+		.flags			= flags,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	trace_xfs_healthmon_report_shutdown(hm, flags);
+
+	xfs_healthmon_push(hm, &event);
+	xfs_healthmon_put(hm);
+}
+
 static inline void
 xfs_healthmon_reset_outbuf(
 	struct xfs_healthmon		*hm)
@@ -502,6 +530,44 @@ xfs_healthmon_reset_outbuf(
 	hm->bufhead = 0;
 }
 
+struct flags_map {
+	unsigned int		in_mask;
+	unsigned int		out_mask;
+};
+
+static const struct flags_map shutdown_map[] = {
+	{ SHUTDOWN_META_IO_ERROR,	XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
+	{ SHUTDOWN_LOG_IO_ERROR,	XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
+	{ SHUTDOWN_FORCE_UMOUNT,	XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
+	{ SHUTDOWN_CORRUPT_INCORE,	XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
+	{ SHUTDOWN_CORRUPT_ONDISK,	XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
+	{ SHUTDOWN_DEVICE_REMOVED,	XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
+};
+
+static inline unsigned int
+__map_flags(
+	const struct flags_map	*map,
+	size_t			array_len,
+	unsigned int		flags)
+{
+	const struct flags_map	*m;
+	unsigned int		ret = 0;
+
+	for (m = map; m < map + array_len; m++) {
+		if (flags & m->in_mask)
+			ret |= m->out_mask;
+	}
+
+	return ret;
+}
+
+#define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags))
+
+static inline unsigned int shutdown_mask(unsigned int in)
+{
+	return map_flags(shutdown_map, in);
+}
+
 static const unsigned int domain_map[] = {
 	[XFS_HEALTHMON_MOUNT]		= XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
 	[XFS_HEALTHMON_FS]		= XFS_HEALTH_MONITOR_DOMAIN_FS,
@@ -517,6 +583,7 @@ static const unsigned int type_map[] = {
 	[XFS_HEALTHMON_CORRUPT]		= XFS_HEALTH_MONITOR_TYPE_CORRUPT,
 	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
 	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
+	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
 };
 
 /* Render event as a V0 structure */
@@ -545,6 +612,9 @@ xfs_healthmon_format_v0(
 		case XFS_HEALTHMON_LOST:
 			hme.e.lost.count = event->lostcount;
 			break;
+		case XFS_HEALTHMON_SHUTDOWN:
+			hme.e.shutdown.reasons = shutdown_mask(event->flags);
+			break;
 		default:
 			break;
 		}


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 07/11] xfs: convey externally discovered fsdax media errors to the health monitor
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (5 preceding siblings ...)
  2026-01-16  5:43 ` [PATCH 06/11] xfs: convey filesystem shutdown " Darrick J. Wong
@ 2026-01-16  5:43 ` Darrick J. Wong
  2026-01-16  5:44 ` [PATCH 08/11] xfs: convey file I/O " Darrick J. Wong
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:43 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Connect the fsdax media failure notification code to the health monitor
so that xfs can send events about that to the xfs_healer daemon.

Later on we'll add the ability for the xfs_scrub media scan (phase 6) to
report the errors that it finds to the kernel so that those are also
logged by xfs_healer.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h      |   15 ++++++++++
 fs/xfs/xfs_healthmon.h      |   16 ++++++++++
 fs/xfs/xfs_trace.h          |   38 +++++++++++++++++++++++++
 fs/xfs/xfs_healthmon.c      |   66 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_notify_failure.c |   17 ++++++++---
 fs/xfs/xfs_trace.c          |    1 +
 6 files changed, 148 insertions(+), 5 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index c8f7011a7ef8ef..38aeb1b0d87b5e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1014,6 +1014,11 @@ struct xfs_rtgroup_geometry {
 #define XFS_HEALTH_MONITOR_DOMAIN_INODE		(3)
 #define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP	(4)
 
+/* disk events */
+#define XFS_HEALTH_MONITOR_DOMAIN_DATADEV	(5)
+#define XFS_HEALTH_MONITOR_DOMAIN_RTDEV		(6)
+#define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV	(7)
+
 /* Health monitor event types */
 
 /* status of the monitor itself */
@@ -1031,6 +1036,9 @@ struct xfs_rtgroup_geometry {
 /* filesystem shutdown */
 #define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN	(6)
 
+/* media errors */
+#define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR	(7)
+
 /* lost events */
 struct xfs_health_monitor_lost {
 	__u64	count;
@@ -1071,6 +1079,12 @@ struct xfs_health_monitor_shutdown {
 	__u32	reasons;
 };
 
+/* disk media errors */
+struct xfs_health_monitor_media {
+	__u64	daddr;
+	__u64	bbcount;
+};
+
 struct xfs_health_monitor_event {
 	/* XFS_HEALTH_MONITOR_DOMAIN_* */
 	__u32	domain;
@@ -1092,6 +1106,7 @@ struct xfs_health_monitor_event {
 		struct xfs_health_monitor_group group;
 		struct xfs_health_monitor_inode inode;
 		struct xfs_health_monitor_shutdown shutdown;
+		struct xfs_health_monitor_media media;
 	} e;
 
 	/* zeroes */
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 1f68b5d65a8edc..54536aac427813 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -79,6 +79,9 @@ enum xfs_healthmon_type {
 	XFS_HEALTHMON_SICK,	/* runtime corruption observed */
 	XFS_HEALTHMON_CORRUPT,	/* fsck reported corruption */
 	XFS_HEALTHMON_HEALTHY,	/* fsck reported healthy structure */
+
+	/* media errors */
+	XFS_HEALTHMON_MEDIA_ERROR,
 };
 
 enum xfs_healthmon_domain {
@@ -89,6 +92,11 @@ enum xfs_healthmon_domain {
 	XFS_HEALTHMON_AG,	/* allocation group metadata */
 	XFS_HEALTHMON_INODE,	/* inode metadata */
 	XFS_HEALTHMON_RTGROUP,	/* realtime group metadata */
+
+	/* media errors */
+	XFS_HEALTHMON_DATADEV,
+	XFS_HEALTHMON_RTDEV,
+	XFS_HEALTHMON_LOGDEV,
 };
 
 struct xfs_healthmon_event {
@@ -126,6 +134,11 @@ struct xfs_healthmon_event {
 		struct {
 			unsigned int	flags;
 		};
+		/* media errors */
+		struct {
+			xfs_daddr_t	daddr;
+			uint64_t	bbcount;
+		};
 	};
 };
 
@@ -141,6 +154,9 @@ void xfs_healthmon_report_inode(struct xfs_inode *ip,
 
 void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags);
 
+void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev,
+		xfs_daddr_t daddr, uint64_t bbcount);
+
 long xfs_ioc_health_monitor(struct file *file,
 		struct xfs_health_monitor __user *arg);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ec99a6d3dd318c..fe7295a4e917ee 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6086,6 +6086,12 @@ DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
 			__entry->ino = event->ino;
 			__entry->gen = event->gen;
 			break;
+		case XFS_HEALTHMON_DATADEV:
+		case XFS_HEALTHMON_LOGDEV:
+		case XFS_HEALTHMON_RTDEV:
+			__entry->offset = event->daddr;
+			__entry->length = event->bbcount;
+			break;
 		}
 	),
 	TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu",
@@ -6228,6 +6234,38 @@ TRACE_EVENT(xfs_healthmon_report_shutdown,
 		  __print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS))
 );
 
+#define XFS_DEVICE_STRINGS \
+	{ XFS_DEV_DATA,		"datadev" }, \
+	{ XFS_DEV_RT,		"rtdev" }, \
+	{ XFS_DEV_LOG,		"logdev" }
+
+TRACE_DEFINE_ENUM(XFS_DEV_DATA);
+TRACE_DEFINE_ENUM(XFS_DEV_RT);
+TRACE_DEFINE_ENUM(XFS_DEV_LOG);
+
+TRACE_EVENT(xfs_healthmon_report_media,
+	TP_PROTO(const struct xfs_healthmon *hm, enum xfs_device fdev,
+		 const struct xfs_healthmon_event *event),
+	TP_ARGS(hm, fdev, event),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, error_dev)
+		__field(uint64_t, daddr)
+		__field(uint64_t, bbcount)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->error_dev = fdev;
+		__entry->daddr = event->daddr;
+		__entry->bbcount = event->bbcount;
+	),
+	TP_printk("dev %d:%d %s daddr 0x%llx bbcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->error_dev, XFS_DEVICE_STRINGS),
+		  __entry->daddr,
+		  __entry->bbcount)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index 97f764e7954152..773bd4414d947a 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -21,6 +21,7 @@
 #include "xfs_health.h"
 #include "xfs_healthmon.h"
 #include "xfs_fsops.h"
+#include "xfs_notify_failure.h"
 
 #include <linux/anon_inodes.h>
 #include <linux/eventpoll.h>
@@ -208,6 +209,19 @@ xfs_healthmon_merge_events(
 		/* yes, we can race to shutdown */
 		existing->flags |= new->flags;
 		return true;
+
+	case XFS_HEALTHMON_MEDIA_ERROR:
+		/* physically adjacent errors can merge */
+		if (existing->daddr + existing->bbcount == new->daddr) {
+			existing->bbcount += new->bbcount;
+			return true;
+		}
+		if (new->daddr + new->bbcount == existing->daddr) {
+			existing->daddr = new->daddr;
+			existing->bbcount += new->bbcount;
+			return true;
+		}
+		return false;
 	}
 
 	return false;
@@ -522,6 +536,48 @@ xfs_healthmon_report_shutdown(
 	xfs_healthmon_put(hm);
 }
 
+static inline enum xfs_healthmon_domain
+media_error_domain(
+	enum xfs_device			fdev)
+{
+	switch (fdev) {
+	case XFS_DEV_DATA:
+		return XFS_HEALTHMON_DATADEV;
+	case XFS_DEV_LOG:
+		return XFS_HEALTHMON_LOGDEV;
+	case XFS_DEV_RT:
+		return XFS_HEALTHMON_RTDEV;
+	}
+
+	ASSERT(0);
+	return 0;
+}
+
+/* Add a media error event to the reporting queue. */
+void
+xfs_healthmon_report_media(
+	struct xfs_mount		*mp,
+	enum xfs_device			fdev,
+	xfs_daddr_t			daddr,
+	uint64_t			bbcount)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= XFS_HEALTHMON_MEDIA_ERROR,
+		.domain			= media_error_domain(fdev),
+		.daddr			= daddr,
+		.bbcount		= bbcount,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	trace_xfs_healthmon_report_media(hm, fdev, &event);
+
+	xfs_healthmon_push(hm, &event);
+	xfs_healthmon_put(hm);
+}
+
 static inline void
 xfs_healthmon_reset_outbuf(
 	struct xfs_healthmon		*hm)
@@ -574,6 +630,9 @@ static const unsigned int domain_map[] = {
 	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
 	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
 	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
+	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
+	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
+	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
 };
 
 static const unsigned int type_map[] = {
@@ -584,6 +643,7 @@ static const unsigned int type_map[] = {
 	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
 	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
 	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
+	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
 };
 
 /* Render event as a V0 structure */
@@ -635,6 +695,12 @@ xfs_healthmon_format_v0(
 		hme.e.inode.ino = event->ino;
 		hme.e.inode.gen = event->gen;
 		break;
+	case XFS_HEALTHMON_DATADEV:
+	case XFS_HEALTHMON_LOGDEV:
+	case XFS_HEALTHMON_RTDEV:
+		hme.e.media.daddr = event->daddr;
+		hme.e.media.bbcount = event->bbcount;
+		break;
 	default:
 		break;
 	}
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 6d5002413c2cb4..1edc4ddd10cdb2 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
 #include "xfs_notify_failure.h"
 #include "xfs_rtgroup.h"
 #include "xfs_rtrmap_btree.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mm.h>
 #include <linux/dax.h>
@@ -219,6 +220,8 @@ xfs_dax_notify_logdev_failure(
 	if (error)
 		return error;
 
+	xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen);
+
 	/*
 	 * In the pre-remove case the failure notification is attempting to
 	 * trigger a force unmount.  The expectation is that the device is
@@ -252,16 +255,20 @@ xfs_dax_notify_dev_failure(
 	uint64_t		bblen;
 	struct xfs_group	*xg = NULL;
 
+	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
+			offset, len, &daddr, &bblen);
+	if (error)
+		return error;
+
+	xfs_healthmon_report_media(mp,
+			type == XG_TYPE_RTG ?  XFS_DEV_RT : XFS_DEV_DATA,
+			daddr, bblen);
+
 	if (!xfs_has_rmapbt(mp)) {
 		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
 		return -EOPNOTSUPP;
 	}
 
-	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
-			offset, len, &daddr, &bblen);
-	if (error)
-		return error;
-
 	if (type == XG_TYPE_RTG) {
 		start_bno = xfs_daddr_to_rtb(mp, daddr);
 		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index d42b864a3837a2..08ddab700a6cd3 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -53,6 +53,7 @@
 #include "xfs_zone_priv.h"
 #include "xfs_health.h"
 #include "xfs_healthmon.h"
+#include "xfs_notify_failure.h"
 
 /*
  * We include this last to have the helpers above available for the trace


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 08/11] xfs: convey file I/O errors to the health monitor
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (6 preceding siblings ...)
  2026-01-16  5:43 ` [PATCH 07/11] xfs: convey externally discovered fsdax media errors " Darrick J. Wong
@ 2026-01-16  5:44 ` Darrick J. Wong
  2026-01-16  5:44 ` [PATCH 09/11] xfs: allow toggling verbose logging on the health monitoring file Darrick J. Wong
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Connect the fserror reporting to the health monitor so that xfs can send
events about file I/O errors to the xfs_healer daemon.  These events are
entirely informational because xfs cannot regenerate user data, so
hopefully the fsnotify I/O error event gets noticed by the relevant
management systems.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |   24 ++++++++++++++
 fs/xfs/xfs_healthmon.h |   21 ++++++++++++
 fs/xfs/xfs_trace.h     |   54 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_healthmon.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_super.c     |   12 +++++++
 fs/xfs/xfs_trace.c     |    2 +
 6 files changed, 198 insertions(+)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 38aeb1b0d87b5e..4ec1b2aede976f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1019,6 +1019,9 @@ struct xfs_rtgroup_geometry {
 #define XFS_HEALTH_MONITOR_DOMAIN_RTDEV		(6)
 #define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV	(7)
 
+/* file range events */
+#define XFS_HEALTH_MONITOR_DOMAIN_FILERANGE	(8)
+
 /* Health monitor event types */
 
 /* status of the monitor itself */
@@ -1039,6 +1042,17 @@ struct xfs_rtgroup_geometry {
 /* media errors */
 #define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR	(7)
 
+/* pagecache I/O to a file range failed */
+#define XFS_HEALTH_MONITOR_TYPE_BUFREAD		(8)
+#define XFS_HEALTH_MONITOR_TYPE_BUFWRITE	(9)
+
+/* direct I/O to a file range failed */
+#define XFS_HEALTH_MONITOR_TYPE_DIOREAD		(10)
+#define XFS_HEALTH_MONITOR_TYPE_DIOWRITE	(11)
+
+/* out of band media error reported for a file range */
+#define XFS_HEALTH_MONITOR_TYPE_DATALOST	(12)
+
 /* lost events */
 struct xfs_health_monitor_lost {
 	__u64	count;
@@ -1079,6 +1093,15 @@ struct xfs_health_monitor_shutdown {
 	__u32	reasons;
 };
 
+/* file range events */
+struct xfs_health_monitor_filerange {
+	__u64	pos;
+	__u64	len;
+	__u64	ino;
+	__u32	gen;
+	__u32	error;
+};
+
 /* disk media errors */
 struct xfs_health_monitor_media {
 	__u64	daddr;
@@ -1107,6 +1130,7 @@ struct xfs_health_monitor_event {
 		struct xfs_health_monitor_inode inode;
 		struct xfs_health_monitor_shutdown shutdown;
 		struct xfs_health_monitor_media media;
+		struct xfs_health_monitor_filerange filerange;
 	} e;
 
 	/* zeroes */
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
index 54536aac427813..0e936507037fda 100644
--- a/fs/xfs/xfs_healthmon.h
+++ b/fs/xfs/xfs_healthmon.h
@@ -82,6 +82,13 @@ enum xfs_healthmon_type {
 
 	/* media errors */
 	XFS_HEALTHMON_MEDIA_ERROR,
+
+	/* file range events */
+	XFS_HEALTHMON_BUFREAD,
+	XFS_HEALTHMON_BUFWRITE,
+	XFS_HEALTHMON_DIOREAD,
+	XFS_HEALTHMON_DIOWRITE,
+	XFS_HEALTHMON_DATALOST,
 };
 
 enum xfs_healthmon_domain {
@@ -97,6 +104,9 @@ enum xfs_healthmon_domain {
 	XFS_HEALTHMON_DATADEV,
 	XFS_HEALTHMON_RTDEV,
 	XFS_HEALTHMON_LOGDEV,
+
+	/* file range events */
+	XFS_HEALTHMON_FILERANGE,
 };
 
 struct xfs_healthmon_event {
@@ -139,6 +149,14 @@ struct xfs_healthmon_event {
 			xfs_daddr_t	daddr;
 			uint64_t	bbcount;
 		};
+		/* file range events */
+		struct {
+			xfs_ino_t	fino;
+			loff_t		fpos;
+			uint64_t	flen;
+			uint32_t	fgen;
+			int		error;
+		};
 	};
 };
 
@@ -157,6 +175,9 @@ void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags);
 void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev,
 		xfs_daddr_t daddr, uint64_t bbcount);
 
+void xfs_healthmon_report_file_ioerror(struct xfs_inode *ip,
+		const struct fserror_event *p);
+
 long xfs_ioc_health_monitor(struct file *file,
 		struct xfs_health_monitor __user *arg);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fe7295a4e917ee..0cf4877753584f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -105,6 +105,7 @@ struct xfs_rtgroup;
 struct xfs_open_zone;
 struct xfs_healthmon_event;
 struct xfs_healthmon;
+struct fserror_event;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -6092,6 +6093,12 @@ DECLARE_EVENT_CLASS(xfs_healthmon_event_class,
 			__entry->offset = event->daddr;
 			__entry->length = event->bbcount;
 			break;
+		case XFS_HEALTHMON_FILERANGE:
+			__entry->ino = event->fino;
+			__entry->gen = event->fgen;
+			__entry->offset = event->fpos;
+			__entry->length = event->flen;
+			break;
 		}
 	),
 	TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu",
@@ -6266,6 +6273,53 @@ TRACE_EVENT(xfs_healthmon_report_media,
 		  __entry->bbcount)
 );
 
+#define FS_ERROR_STRINGS \
+	{ FSERR_BUFFERED_READ,		"buffered_read" }, \
+	{ FSERR_BUFFERED_WRITE,		"buffered_write" }, \
+	{ FSERR_DIRECTIO_READ,		"directio_read" }, \
+	{ FSERR_DIRECTIO_WRITE,		"directio_write" }, \
+	{ FSERR_DATA_LOST,		"data_lost" }, \
+	{ FSERR_METADATA,		"metadata" }
+
+TRACE_DEFINE_ENUM(FSERR_BUFFERED_READ);
+TRACE_DEFINE_ENUM(FSERR_BUFFERED_WRITE);
+TRACE_DEFINE_ENUM(FSERR_DIRECTIO_READ);
+TRACE_DEFINE_ENUM(FSERR_DIRECTIO_WRITE);
+TRACE_DEFINE_ENUM(FSERR_DATA_LOST);
+TRACE_DEFINE_ENUM(FSERR_METADATA);
+
+TRACE_EVENT(xfs_healthmon_report_file_ioerror,
+	TP_PROTO(const struct xfs_healthmon *hm,
+		 const struct fserror_event *p),
+	TP_ARGS(hm, p),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(unsigned long long, ino)
+		__field(unsigned int, gen)
+		__field(long long, pos)
+		__field(unsigned long long, len)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = hm->dev;
+		__entry->type = p->type;
+		__entry->ino = XFS_I(p->inode)->i_ino;
+		__entry->gen = p->inode->i_generation;
+		__entry->pos = p->pos;
+		__entry->len = p->len;
+		__entry->error = p->error;
+	),
+	TP_printk("dev %d:%d ino 0x%llx gen 0x%x op %s pos 0x%llx bytecount 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->gen,
+		  __print_symbolic(__entry->type, FS_ERROR_STRINGS),
+		  __entry->pos,
+		  __entry->len,
+		  __entry->error)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index 773bd4414d947a..1bb4b0adf2470e 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -22,10 +22,12 @@
 #include "xfs_healthmon.h"
 #include "xfs_fsops.h"
 #include "xfs_notify_failure.h"
+#include "xfs_file.h"
 
 #include <linux/anon_inodes.h>
 #include <linux/eventpoll.h>
 #include <linux/poll.h>
+#include <linux/fserror.h>
 
 /*
  * Live Health Monitoring
@@ -222,6 +224,27 @@ xfs_healthmon_merge_events(
 			return true;
 		}
 		return false;
+
+	case XFS_HEALTHMON_BUFREAD:
+	case XFS_HEALTHMON_BUFWRITE:
+	case XFS_HEALTHMON_DIOREAD:
+	case XFS_HEALTHMON_DIOWRITE:
+	case XFS_HEALTHMON_DATALOST:
+		/* logically adjacent file ranges can merge */
+		if (existing->fino != new->fino || existing->fgen != new->fgen)
+			return false;
+
+		if (existing->fpos + existing->flen == new->fpos) {
+			existing->flen += new->flen;
+			return true;
+		}
+
+		if (new->fpos + new->flen == existing->fpos) {
+			existing->fpos = new->fpos;
+			existing->flen += new->flen;
+			return true;
+		}
+		return false;
 	}
 
 	return false;
@@ -578,6 +601,55 @@ xfs_healthmon_report_media(
 	xfs_healthmon_put(hm);
 }
 
+static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
+{
+	switch (action) {
+	case FSERR_BUFFERED_READ:
+		return XFS_HEALTHMON_BUFREAD;
+	case FSERR_BUFFERED_WRITE:
+		return XFS_HEALTHMON_BUFWRITE;
+	case FSERR_DIRECTIO_READ:
+		return XFS_HEALTHMON_DIOREAD;
+	case FSERR_DIRECTIO_WRITE:
+		return XFS_HEALTHMON_DIOWRITE;
+	case FSERR_DATA_LOST:
+		return XFS_HEALTHMON_DATALOST;
+	case FSERR_METADATA:
+		/* filtered out by xfs_fs_report_error */
+		break;
+	}
+
+	ASSERT(0);
+	return -1;
+}
+
+/* Add a file io error event to the reporting queue. */
+void
+xfs_healthmon_report_file_ioerror(
+	struct xfs_inode		*ip,
+	const struct fserror_event	*p)
+{
+	struct xfs_healthmon_event	event = {
+		.type			= file_ioerr_type(p->type),
+		.domain			= XFS_HEALTHMON_FILERANGE,
+		.fino			= ip->i_ino,
+		.fgen			= VFS_I(ip)->i_generation,
+		.fpos			= p->pos,
+		.flen			= p->len,
+		/* send positive error number to userspace */
+		.error			= -p->error,
+	};
+	struct xfs_healthmon		*hm = xfs_healthmon_get(ip->i_mount);
+
+	if (!hm)
+		return;
+
+	trace_xfs_healthmon_report_file_ioerror(hm, p);
+
+	xfs_healthmon_push(hm, &event);
+	xfs_healthmon_put(hm);
+}
+
 static inline void
 xfs_healthmon_reset_outbuf(
 	struct xfs_healthmon		*hm)
@@ -633,6 +705,7 @@ static const unsigned int domain_map[] = {
 	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
 	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
 	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
+	[XFS_HEALTHMON_FILERANGE]	= XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
 };
 
 static const unsigned int type_map[] = {
@@ -644,6 +717,11 @@ static const unsigned int type_map[] = {
 	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
 	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
 	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
+	[XFS_HEALTHMON_BUFREAD]		= XFS_HEALTH_MONITOR_TYPE_BUFREAD,
+	[XFS_HEALTHMON_BUFWRITE]	= XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
+	[XFS_HEALTHMON_DIOREAD]		= XFS_HEALTH_MONITOR_TYPE_DIOREAD,
+	[XFS_HEALTHMON_DIOWRITE]	= XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
+	[XFS_HEALTHMON_DATALOST]	= XFS_HEALTH_MONITOR_TYPE_DATALOST,
 };
 
 /* Render event as a V0 structure */
@@ -701,6 +779,13 @@ xfs_healthmon_format_v0(
 		hme.e.media.daddr = event->daddr;
 		hme.e.media.bbcount = event->bbcount;
 		break;
+	case XFS_HEALTHMON_FILERANGE:
+		hme.e.filerange.ino = event->fino;
+		hme.e.filerange.gen = event->fgen;
+		hme.e.filerange.pos = event->fpos;
+		hme.e.filerange.len = event->flen;
+		hme.e.filerange.error = abs(event->error);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1f432d6645898e..ad666d0c8d2d75 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,12 +47,14 @@
 #include "xfs_parent.h"
 #include "xfs_rtalloc.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
 #include <linux/magic.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/fserror.h>
 
 static const struct super_operations xfs_super_operations;
 
@@ -1301,6 +1303,15 @@ xfs_fs_show_stats(
 	return 0;
 }
 
+static void
+xfs_fs_report_error(
+	const struct fserror_event	*event)
+{
+	/* healthmon already knows about non-inode and metadata errors */
+	if (event->inode && event->type != FSERR_METADATA)
+		xfs_healthmon_report_file_ioerror(XFS_I(event->inode), event);
+}
+
 static const struct super_operations xfs_super_operations = {
 	.alloc_inode		= xfs_fs_alloc_inode,
 	.destroy_inode		= xfs_fs_destroy_inode,
@@ -1317,6 +1328,7 @@ static const struct super_operations xfs_super_operations = {
 	.free_cached_objects	= xfs_fs_free_cached_objects,
 	.shutdown		= xfs_fs_shutdown,
 	.show_stats		= xfs_fs_show_stats,
+	.report_error		= xfs_fs_report_error,
 };
 
 static int
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 08ddab700a6cd3..3ae449646eb9b2 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -54,6 +54,8 @@
 #include "xfs_health.h"
 #include "xfs_healthmon.h"
 #include "xfs_notify_failure.h"
+#include "xfs_file.h"
+#include <linux/fserror.h>
 
 /*
  * We include this last to have the helpers above available for the trace


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 09/11] xfs: allow toggling verbose logging on the health monitoring file
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (7 preceding siblings ...)
  2026-01-16  5:44 ` [PATCH 08/11] xfs: convey file I/O " Darrick J. Wong
@ 2026-01-16  5:44 ` Darrick J. Wong
  2026-01-16  5:44 ` [PATCH 10/11] xfs: check if an open file is on the health monitored fs Darrick J. Wong
  2026-01-16  5:44 ` [PATCH 11/11] xfs: add media verification ioctl Darrick J. Wong
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Make it so that we can reconfigure the health monitoring device by
calling the XFS_IOC_HEALTH_MONITOR ioctl on it.  As of right now we can
only toggle the verbose flag, but this is less annoying than having to
close the monitor fd and reopen it.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_healthmon.c |   44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)


diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index 1bb4b0adf2470e..4a8cbd87932201 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -23,6 +23,7 @@
 #include "xfs_fsops.h"
 #include "xfs_notify_failure.h"
 #include "xfs_file.h"
+#include "xfs_ioctl.h"
 
 #include <linux/anon_inodes.h>
 #include <linux/eventpoll.h>
@@ -1066,12 +1067,55 @@ xfs_healthmon_show_fdinfo(
 	mutex_unlock(&hm->lock);
 }
 
+/* Reconfigure the health monitor. */
+STATIC long
+xfs_healthmon_reconfigure(
+	struct file			*file,
+	unsigned int			cmd,
+	void __user			*arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm = file->private_data;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	mutex_lock(&hm->lock);
+	hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE);
+	mutex_unlock(&hm->lock);
+
+	return 0;
+}
+
+/* Handle ioctls for the health monitoring thread. */
+STATIC long
+xfs_healthmon_ioctl(
+	struct file			*file,
+	unsigned int			cmd,
+	unsigned long			p)
+{
+	void __user			*arg = (void __user *)p;
+
+	switch (cmd) {
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_healthmon_reconfigure(file, cmd, arg);
+	default:
+		break;
+	}
+
+	return -ENOTTY;
+}
+
 static const struct file_operations xfs_healthmon_fops = {
 	.owner		= THIS_MODULE,
 	.show_fdinfo	= xfs_healthmon_show_fdinfo,
 	.read_iter	= xfs_healthmon_read_iter,
 	.poll		= xfs_healthmon_poll,
 	.release	= xfs_healthmon_release,
+	.unlocked_ioctl	= xfs_healthmon_ioctl,
 };
 
 /*


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 10/11] xfs: check if an open file is on the health monitored fs
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (8 preceding siblings ...)
  2026-01-16  5:44 ` [PATCH 09/11] xfs: allow toggling verbose logging on the health monitoring file Darrick J. Wong
@ 2026-01-16  5:44 ` Darrick J. Wong
  2026-01-16  5:44 ` [PATCH 11/11] xfs: add media verification ioctl Darrick J. Wong
  10 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: hch, linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Create a new ioctl for the healthmon file that checks that a given fd
points to the same filesystem that the healthmon file is monitoring.
This allows xfs_healer to check that when it reopens a mountpoint to
perform repairs, the file that it gets matches the filesystem that
generated the corruption report.

(Note that xfs_healer doesn't maintain an open fd to a filesystem that
it's monitoring so that it doesn't pin the mount.)

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |   12 +++++++++++-
 fs/xfs/xfs_healthmon.c |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 4ec1b2aede976f..a01303c5de6ce6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1151,6 +1151,15 @@ struct xfs_health_monitor {
 /* Initial return format version */
 #define XFS_HEALTH_MONITOR_FMT_V0	(0)
 
+/*
+ * Check that a given fd points to the same filesystem that the health monitor
+ * is monitoring.
+ */
+struct xfs_health_file_on_monitored_fs {
+	__s32		fd;
+	__u32		flags;	/* zero for now */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1191,7 +1200,8 @@ struct xfs_health_monitor {
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
 #define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
-
+#define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \
+				_IOW ('X', 69, struct xfs_health_file_on_monitored_fs)
 /*
  * ioctl commands that replace IRIX syssgi()'s
  */
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
index 4a8cbd87932201..3030fa93c1e575 100644
--- a/fs/xfs/xfs_healthmon.c
+++ b/fs/xfs/xfs_healthmon.c
@@ -1090,6 +1090,38 @@ xfs_healthmon_reconfigure(
 	return 0;
 }
 
+/* Does the fd point to the same filesystem as the one we're monitoring? */
+STATIC long
+xfs_healthmon_file_on_monitored_fs(
+	struct file			*file,
+	unsigned int			cmd,
+	void __user			*arg)
+{
+	struct xfs_health_file_on_monitored_fs hms;
+	struct xfs_healthmon		*hm = file->private_data;
+	struct inode			*hms_inode;
+
+	if (copy_from_user(&hms, arg, sizeof(hms)))
+		return -EFAULT;
+
+	if (hms.flags)
+		return -EINVAL;
+
+	CLASS(fd, hms_fd)(hms.fd);
+	if (fd_empty(hms_fd))
+		return -EBADF;
+
+	hms_inode = file_inode(fd_file(hms_fd));
+	mutex_lock(&hm->lock);
+	if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) {
+		mutex_unlock(&hm->lock);
+		return -ESTALE;
+	}
+
+	mutex_unlock(&hm->lock);
+	return 0;
+}
+
 /* Handle ioctls for the health monitoring thread. */
 STATIC long
 xfs_healthmon_ioctl(
@@ -1102,6 +1134,8 @@ xfs_healthmon_ioctl(
 	switch (cmd) {
 	case XFS_IOC_HEALTH_MONITOR:
 		return xfs_healthmon_reconfigure(file, cmd, arg);
+	case XFS_IOC_HEALTH_FD_ON_MONITORED_FS:
+		return xfs_healthmon_file_on_monitored_fs(file, cmd, arg);
 	default:
 		break;
 	}


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 11/11] xfs: add media verification ioctl
  2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
                   ` (9 preceding siblings ...)
  2026-01-16  5:44 ` [PATCH 10/11] xfs: check if an open file is on the health monitored fs Darrick J. Wong
@ 2026-01-16  5:44 ` Darrick J. Wong
  2026-01-19 15:56   ` Christoph Hellwig
  2026-01-20  4:12   ` [PATCH v6.1 " Darrick J. Wong
  10 siblings, 2 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-16  5:44 UTC (permalink / raw)
  To: cem, djwong; +Cc: linux-xfs, linux-fsdevel, hch

From: Darrick J. Wong <djwong@kernel.org>

Add a new privileged ioctl so that xfs_scrub can ask the kernel to
verify the media of the devices backing an xfs filesystem, and have any
resulting media errors reported to fsnotify and xfs_healer.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h    |   30 +++
 fs/xfs/xfs_trace.h        |   98 ++++++++++
 fs/xfs/xfs_verify_media.h |   13 +
 fs/xfs/Makefile           |    1 
 fs/xfs/xfs_ioctl.c        |    3 
 fs/xfs/xfs_verify_media.c |  459 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 604 insertions(+)
 create mode 100644 fs/xfs/xfs_verify_media.h
 create mode 100644 fs/xfs/xfs_verify_media.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index a01303c5de6ce6..d165de607d179e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1160,6 +1160,34 @@ struct xfs_health_file_on_monitored_fs {
 	__u32		flags;	/* zero for now */
 };
 
+/* Verify the media of the underlying devices */
+struct xfs_verify_media {
+	__u32	me_dev;		/* I: XFS_DEV_{DATA,LOG,RT} */
+	__u32	me_flags;	/* I: XFS_VERIFY_MEDIA_* */
+
+	/*
+	 * IO: inclusive start of disk range to verify, in 512b blocks.
+	 * Will be adjusted upwards as media verification succeeds.
+	 */
+	__u64	me_start_daddr;
+
+	/*
+	 * IO: exclusive end of the disk range to verify, in 512b blocks.
+	 * Can be adjusted downwards to match device size.
+	 */
+	__u64	me_end_daddr;
+
+	__u32	me_ioerror;	/* O: I/O error (positive) */
+	__u32	me_max_io_size;	/* I: maximum IO size in bytes */
+
+	__u32	me_rest_us;	/* I: rest time between IOs, usecs */
+	__u32	me_pad;		/* zero */
+};
+
+#define XFS_VERIFY_MEDIA_REPORT	(1 << 0)	/* report to fsnotify */
+
+#define XFS_VERIFY_MEDIA_FLAGS	(XFS_VERIFY_MEDIA_REPORT)
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1202,6 +1230,8 @@ struct xfs_health_file_on_monitored_fs {
 #define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 #define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \
 				_IOW ('X', 69, struct xfs_health_file_on_monitored_fs)
+#define XFS_IOC_VERIFY_MEDIA	_IOWR('X', 70, struct xfs_verify_media)
+
 /*
  * ioctl commands that replace IRIX syssgi()'s
  */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0cf4877753584f..3483461cf46255 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6320,6 +6320,104 @@ TRACE_EVENT(xfs_healthmon_report_file_ioerror,
 		  __entry->error)
 );
 
+TRACE_EVENT(xfs_verify_media,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 const struct folio *folio),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, folio),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(unsigned int, bufsize)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->bufsize = folio_size(folio);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->bufsize)
+);
+
+TRACE_EVENT(xfs_verify_media_end,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev),
+	TP_ARGS(mp, me, fdev),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(int, ioerror)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->ioerror = me->me_ioerror;
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->ioerror)
+);
+
+TRACE_EVENT(xfs_verify_media_error,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 blk_status_t status),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, status),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->error = blk_status_to_errno(status);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->error)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_verify_media.h b/fs/xfs/xfs_verify_media.h
new file mode 100644
index 00000000000000..dc6eee9c88636b
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_VERIFY_MEDIA_H__
+#define __XFS_VERIFY_MEDIA_H__
+
+struct xfs_verify_media;
+int xfs_ioc_verify_media(struct file *file,
+		struct xfs_verify_media __user *arg);
+
+#endif /* __XFS_VERIFY_MEDIA_H__ */
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 1b7385e23b3463..9f7133e025768d 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,6 +106,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_symlink.o \
 				   xfs_sysfs.o \
 				   xfs_trans.o \
+				   xfs_verify_media.o \
 				   xfs_xattr.o
 
 # low-level transaction/log code
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c04c41ca924e37..80a005999d2df3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
 #include "xfs_healthmon.h"
+#include "xfs_verify_media.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1422,6 +1423,8 @@ xfs_file_ioctl(
 
 	case XFS_IOC_HEALTH_MONITOR:
 		return xfs_ioc_health_monitor(filp, arg);
+	case XFS_IOC_VERIFY_MEDIA:
+		return xfs_ioc_verify_media(filp, arg);
 
 	default:
 		return -ENOTTY;
diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
new file mode 100644
index 00000000000000..29f8cae5d7ee88
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_health.h"
+#include "xfs_healthmon.h"
+#include "xfs_trace.h"
+#include "xfs_verify_media.h"
+
+#include <linux/fserror.h>
+
+struct xfs_group_data_lost {
+	xfs_agblock_t		startblock;
+	xfs_extlen_t		blockcount;
+};
+
+/* Report lost file data from rmap records */
+STATIC int
+xfs_verify_report_data_lost(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*data)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_inode		*ip;
+	struct xfs_group_data_lost	*lost = data;
+	xfs_fileoff_t			fileoff = rec->rm_offset;
+	xfs_extlen_t			blocks = rec->rm_blockcount;
+	const bool			is_attr =
+			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
+	const xfs_agblock_t		lost_end =
+			lost->startblock + lost->blockcount;
+	const xfs_agblock_t		rmap_end =
+			rec->rm_startblock + rec->rm_blockcount;
+	int				error = 0;
+
+	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+	       return 0;
+
+	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
+	if (error)
+		return 0;
+
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+		goto out_rele;
+	}
+
+	if (is_attr) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+		goto out_rele;
+	}
+
+	if (lost->startblock > rec->rm_startblock) {
+		fileoff += lost->startblock - rec->rm_startblock;
+		blocks -= lost->startblock - rec->rm_startblock;
+	}
+	if (rmap_end > lost_end)
+		blocks -= rmap_end - lost_end;
+
+	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+
+out_rele:
+	xfs_irele(ip);
+	return 0;
+}
+
+/* Walk reverse mappings to look for all file data loss */
+STATIC int
+xfs_verify_report_losses(
+	struct xfs_mount	*mp,
+	enum xfs_group_type	type,
+	xfs_daddr_t		daddr,
+	u64			bblen)
+{
+	struct xfs_group	*xg = NULL;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		start_bno, end_bno;
+	uint32_t		start_gno, end_gno;
+	int			error;
+
+	if (type == XG_TYPE_RTG) {
+		start_bno = xfs_daddr_to_rtb(mp, daddr);
+		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+	} else {
+		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+	}
+
+	tp = xfs_trans_alloc_empty(mp);
+	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+		struct xfs_buf		*agf_bp = NULL;
+		struct xfs_rtgroup	*rtg = NULL;
+		struct xfs_btree_cur	*cur;
+		struct xfs_rmap_irec	ri_low = { };
+		struct xfs_rmap_irec	ri_high;
+		struct xfs_group_data_lost lost;
+
+		if (type == XG_TYPE_AG) {
+			struct xfs_perag	*pag = to_perag(xg);
+
+			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+			if (error) {
+				xfs_perag_put(pag);
+				break;
+			}
+
+			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+		} else {
+			rtg = to_rtg(xg);
+			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+		}
+
+		/*
+		 * Set the rmap range from ri_low to ri_high, which represents
+		 * a [start, end] where we looking for the files or metadata.
+		 */
+		memset(&ri_high, 0xFF, sizeof(ri_high));
+		if (xg->xg_gno == start_gno)
+			ri_low.rm_startblock =
+				xfs_fsb_to_gbno(mp, start_bno, type);
+		if (xg->xg_gno == end_gno)
+			ri_high.rm_startblock =
+				xfs_fsb_to_gbno(mp, end_bno, type);
+
+		lost.startblock = ri_low.rm_startblock;
+		lost.blockcount = min(xg->xg_block_count,
+				      ri_high.rm_startblock + 1) -
+							ri_low.rm_startblock;
+
+		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+				xfs_verify_report_data_lost, &lost);
+		xfs_btree_del_cursor(cur, error);
+		if (agf_bp)
+			xfs_trans_brelse(tp, agf_bp);
+		if (rtg)
+			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+		if (error) {
+			xfs_group_put(xg);
+			break;
+		}
+	}
+
+	xfs_trans_cancel(tp);
+	return 0;
+}
+
+/*
+ * Compute the desired verify IO size.
+ *
+ * To minimize command overhead, we'd like to create bios that are 1MB, though
+ * we allow the user to ask for a smaller size.
+ */
+STATIC unsigned int
+xfs_verify_iosize(
+	const struct xfs_verify_media	*me,
+	struct xfs_buftarg		*btp,
+	uint64_t			bbcount)
+{
+	unsigned int			iosize =
+			min_not_zero(SZ_1M, me->me_max_io_size);
+
+	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
+
+	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
+			BBTOB(bbcount));
+}
+
+/* Allocate as much memory as we can get for verification buffer. */
+STATIC struct folio *
+xfs_verify_alloc_folio(
+	const unsigned int	iosize)
+{
+	unsigned int		order = get_order(iosize);
+
+	while (order > 0) {
+		struct folio	*folio =
+			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+
+		if (folio)
+			return folio;
+		order--;
+	}
+
+	return folio_alloc(GFP_KERNEL, 0);
+}
+
+/* Construct a bio for doing the verification. */
+STATIC struct bio *
+xfs_verify_bio_alloc(
+	struct xfs_buftarg	*btp,
+	xfs_daddr_t		daddr,
+	uint64_t		bbcount,
+	struct folio		*folio)
+{
+	struct bio		*bio;
+
+	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+	if (!bio)
+		return NULL;
+
+	bio->bi_iter.bi_sector = daddr;
+	bio_add_folio_nofail(bio, folio,
+			min(bbcount << SECTOR_SHIFT, folio_size(folio)), 0);
+
+	return bio;
+}
+
+/* Report any kind of problem verifying media */
+STATIC void
+xfs_verify_media_error(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me,
+	struct xfs_buftarg	*btp,
+	xfs_daddr_t		daddr,
+	unsigned int		bio_bbcount,
+	blk_status_t		bio_status)
+{
+	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+			bio_bbcount, bio_status);
+
+	/*
+	 * Pass any I/O error up to the caller if we didn't successfully verify
+	 * any bytes at all.
+	 */
+	if (me->me_start_daddr == daddr)
+		me->me_ioerror = -blk_status_to_errno(bio_status);
+
+	/*
+	 * PI validation failures, medium errors, or general IO errors are
+	 * treated as indicators of data loss.  Everything else are (hopefully)
+	 * transient errors and are not reported.
+	 */
+	switch (bio_status) {
+	case BLK_STS_PROTECTION:
+	case BLK_STS_IOERR:
+	case BLK_STS_MEDIUM:
+		break;
+	default:
+		return;
+	}
+
+	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+		return;
+
+	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+
+	if (!xfs_has_rmapbt(mp))
+		return;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+		break;
+	case XFS_DEV_RT:
+		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+		break;
+	}
+}
+
+/* Verify the media of an xfs device by submitting read requests to the disk. */
+STATIC int
+xfs_verify_media(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me)
+{
+	struct xfs_buftarg	*btp = NULL;
+	struct folio		*folio;
+	xfs_daddr_t		daddr;
+	uint64_t		bbcount;
+	int			error = 0;
+
+	me->me_ioerror = 0;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		btp = mp->m_ddev_targp;
+		break;
+	case XFS_DEV_LOG:
+		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+			btp = mp->m_logdev_targp;
+		break;
+	case XFS_DEV_RT:
+		btp = mp->m_rtdev_targp;
+		break;
+	}
+	if (!btp)
+		return -ENODEV;
+
+	/*
+	 * If the caller told us to verify beyond the end of the disk, tell the
+	 * user exactly where that was.
+	 */
+	if (me->me_end_daddr > btp->bt_nr_sectors)
+		me->me_end_daddr = btp->bt_nr_sectors;
+
+	/* start and end have to be aligned to the lba size */
+	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+			bdev_logical_block_size(btp->bt_bdev)))
+		return -EINVAL;
+
+	/*
+	 * end_daddr is the exclusive end of the range, so if start_daddr
+	 * reaches there (or beyond), there's no work to be done.
+	 */
+	if (me->me_start_daddr >= me->me_end_daddr)
+		return 0;
+
+	/*
+	 * There are three ranges involved here:
+	 *
+	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
+	 *    user wants to verify.  end_daddr can be beyond the end of the
+	 *    disk; we'll constrain it to the end if necessary.
+	 *
+	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
+	 *    verified.  We update daddr after each successful read.
+	 *    me->me_start_daddr is set to daddr before returning.
+	 *
+	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
+	 *    verifying.
+	 */
+	daddr = me->me_start_daddr;
+	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+			  me->me_start_daddr;
+
+	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+	if (!folio)
+		return -ENOMEM;
+
+	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+			folio);
+
+	while (bbcount > 0) {
+		struct bio	*bio;
+		unsigned int	bio_bbcount;
+		blk_status_t	bio_status;
+
+		bio = xfs_verify_bio_alloc(btp, daddr, bbcount, folio);
+		if (!bio) {
+			error = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * Save the length of the bio before we submit it, because we
+		 * need the original daddr and length for reporting IO errors
+		 * if the bio fails.
+		 */
+		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+		submit_bio_wait(bio);
+		bio_status = bio->bi_status;
+		bio_put(bio);
+		if (bio_status != BLK_STS_OK) {
+			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+					bio_status);
+			error = 0;
+			break;
+		}
+
+		daddr += bio_bbcount;
+		bbcount -= bio_bbcount;
+
+		if (bbcount == 0)
+			break;
+
+		if (me->me_rest_us) {
+			ktime_t	expires;
+
+			/* u64 math: u32 usecs * 1000 wraps beyond ~4.29s */
+			expires = ktime_add_us(ktime_get(), me->me_rest_us);
+			set_current_state(TASK_KILLABLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	folio_put(folio);
+
+	if (error)
+		return error;
+
+	/*
+	 * Advance start_daddr to the end of what we verified if there wasn't
+	 * an operational error.
+	 */
+	me->me_start_daddr = daddr;
+	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+	return 0;
+}
+
+int
+xfs_ioc_verify_media(
+	struct file			*file,
+	struct xfs_verify_media __user	*arg)
+{
+	struct xfs_verify_media		me;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&me, arg, sizeof(me)))
+		return -EFAULT;
+
+	if (me.me_pad)
+		return -EINVAL;
+	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
+		return -EINVAL;
+
+	switch (me.me_dev) {
+	case XFS_DEV_DATA:
+	case XFS_DEV_LOG:
+	case XFS_DEV_RT:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	error = xfs_verify_media(mp, &me);
+	if (error)
+		return error;
+
+	if (copy_to_user(arg, &me, sizeof(me)))
+		return -EFAULT;
+
+	return 0;
+}


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor
  2026-01-16  5:43 ` [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor Darrick J. Wong
@ 2026-01-19 15:44   ` Christoph Hellwig
  0 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-19 15:44 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, linux-xfs, linux-fsdevel, hch

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 06/11] xfs: convey filesystem shutdown events to the health monitor
  2026-01-16  5:43 ` [PATCH 06/11] xfs: convey filesystem shutdown " Darrick J. Wong
@ 2026-01-19 15:44   ` Christoph Hellwig
  0 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-19 15:44 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, linux-xfs, linux-fsdevel, hch

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 11/11] xfs: add media verification ioctl
  2026-01-16  5:44 ` [PATCH 11/11] xfs: add media verification ioctl Darrick J. Wong
@ 2026-01-19 15:56   ` Christoph Hellwig
  2026-01-19 17:35     ` Darrick J. Wong
  2026-01-20  4:12   ` [PATCH v6.1 " Darrick J. Wong
  1 sibling, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-19 15:56 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, linux-xfs, linux-fsdevel, hch

On Thu, Jan 15, 2026 at 09:44:53PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> verify the media of the devices backing an xfs filesystem, and have any
> resulting media errors reported to fsnotify and xfs_healer.

I really wish this would explain the approach (reading data into a
kernel buffer, and the choices of the buffer size and I/O pattern)
and their rationale a bit better here.

> +
> +struct xfs_group_data_lost {
> +	xfs_agblock_t		startblock;
> +	xfs_extlen_t		blockcount;
> +};
> +
> +/* Report lost file data from rmap records */
> +STATIC int
> +xfs_verify_report_data_lost(
> +	struct xfs_btree_cur		*cur,
> +	const struct xfs_rmap_irec	*rec,
> +	void				*data)
> +{
> +	struct xfs_mount		*mp = cur->bc_mp;
> +	struct xfs_inode		*ip;
> +	struct xfs_group_data_lost	*lost = data;
> +	xfs_fileoff_t			fileoff = rec->rm_offset;
> +	xfs_extlen_t			blocks = rec->rm_blockcount;
> +	const bool			is_attr =
> +			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
> +	const xfs_agblock_t		lost_end =
> +			lost->startblock + lost->blockcount;
> +	const xfs_agblock_t		rmap_end =
> +			rec->rm_startblock + rec->rm_blockcount;
> +	int				error = 0;
> +
> +	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
> +	       return 0;
> +
> +	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
> +	if (error)
> +		return 0;
> +
> +	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
> +		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
> +		goto out_rele;
> +	}
> +
> +	if (is_attr) {
> +		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
> +		goto out_rele;
> +	}
> +
> +	if (lost->startblock > rec->rm_startblock) {
> +		fileoff += lost->startblock - rec->rm_startblock;
> +		blocks -= lost->startblock - rec->rm_startblock;
> +	}
> +	if (rmap_end > lost_end)
> +		blocks -= rmap_end - lost_end;
> +
> +	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
> +			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
> +
> +out_rele:
> +	xfs_irele(ip);
> +	return 0;
> +}
> +
> +/* Walk reverse mappings to look for all file data loss */
> +STATIC int
> +xfs_verify_report_losses(
> +	struct xfs_mount	*mp,
> +	enum xfs_group_type	type,
> +	xfs_daddr_t		daddr,
> +	u64			bblen)
> +{
> +	struct xfs_group	*xg = NULL;
> +	struct xfs_trans	*tp;
> +	xfs_fsblock_t		start_bno, end_bno;
> +	uint32_t		start_gno, end_gno;
> +	int			error;
> +
> +	if (type == XG_TYPE_RTG) {
> +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> +	} else {
> +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> +	}
> +
> +	tp = xfs_trans_alloc_empty(mp);
> +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> +		struct xfs_buf		*agf_bp = NULL;
> +		struct xfs_rtgroup	*rtg = NULL;
> +		struct xfs_btree_cur	*cur;
> +		struct xfs_rmap_irec	ri_low = { };
> +		struct xfs_rmap_irec	ri_high;
> +		struct xfs_group_data_lost lost;
> +
> +		if (type == XG_TYPE_AG) {
> +			struct xfs_perag	*pag = to_perag(xg);
> +
> +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> +			if (error) {
> +				xfs_perag_put(pag);
> +				break;
> +			}
> +
> +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> +		} else {
> +			rtg = to_rtg(xg);
> +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> +		}
> +
> +		/*
> +		 * Set the rmap range from ri_low to ri_high, which represents
> +		 * a [start, end] where we looking for the files or metadata.
> +		 */
> +		memset(&ri_high, 0xFF, sizeof(ri_high));
> +		if (xg->xg_gno == start_gno)
> +			ri_low.rm_startblock =
> +				xfs_fsb_to_gbno(mp, start_bno, type);
> +		if (xg->xg_gno == end_gno)
> +			ri_high.rm_startblock =
> +				xfs_fsb_to_gbno(mp, end_bno, type);
> +
> +		lost.startblock = ri_low.rm_startblock;
> +		lost.blockcount = min(xg->xg_block_count,
> +				      ri_high.rm_startblock + 1) -
> +							ri_low.rm_startblock;
> +
> +		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> +				xfs_verify_report_data_lost, &lost);
> +		xfs_btree_del_cursor(cur, error);
> +		if (agf_bp)
> +			xfs_trans_brelse(tp, agf_bp);
> +		if (rtg)
> +			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
> +		if (error) {
> +			xfs_group_put(xg);
> +			break;
> +		}
> +	}
> +
> +	xfs_trans_cancel(tp);
> +	return 0;
> +}
> +
> +/*
> + * Compute the desired verify IO size.
> + *
> + * To minimize command overhead, we'd like to create bios that are 1MB, though
> + * we allow the user to ask for a smaller size.
> + */
> +STATIC unsigned int
> +xfs_verify_iosize(
> +	const struct xfs_verify_media	*me,
> +	struct xfs_buftarg		*btp,
> +	uint64_t			bbcount)
> +{
> +	unsigned int			iosize =
> +			min_not_zero(SZ_1M, me->me_max_io_size);
> +
> +	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
> +	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
> +
> +	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
> +			BBTOB(bbcount));
> +}

> +/* Allocate as much memory as we can get for verification buffer. */
> +STATIC struct folio *

Can we please retire STATIC already?

> +STATIC void
> +xfs_verify_media_error(
> +	struct xfs_mount	*mp,
> +	struct xfs_verify_media	*me,
> +	struct xfs_buftarg	*btp,
> +	xfs_daddr_t		daddr,
> +	unsigned int		bio_bbcount,
> +	blk_status_t		bio_status)
> +{
> +	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
> +			bio_bbcount, bio_status);
> +
> +	/*
> +	 * Pass any I/O error up to the caller if we didn't successfully verify
> +	 * any bytes at all.
> +	 */
> +	if (me->me_start_daddr == daddr)
> +		me->me_ioerror = -blk_status_to_errno(bio_status);
> +
> +	/*
> +	 * PI validation failures, medium errors, or general IO errors are
> +	 * treated as indicators of data loss.  Everything else are (hopefully)
> +	 * transient errors and are not reported.
> +	 */

But still left in me->me_ioerror.  Is that intentional?

> +	switch (me->me_dev) {
> +	case XFS_DEV_DATA:
> +		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
> +		break;
> +	case XFS_DEV_RT:
> +		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
> +		break;
> +	}

At some point we really need dev_to_group_type and vice versa helpers.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 11/11] xfs: add media verification ioctl
  2026-01-19 15:56   ` Christoph Hellwig
@ 2026-01-19 17:35     ` Darrick J. Wong
  0 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-19 17:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: cem, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 04:56:39PM +0100, Christoph Hellwig wrote:
> On Thu, Jan 15, 2026 at 09:44:53PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> > verify the media of the devices backing an xfs filesystem, and have any
> > resulting media errors reported to fsnotify and xfs_healer.
> 
> I really wish this would explain the approach (reading data into a
> kernel buffer, and the choices of the buffer size and I/O pattern)
> and their rationale a bit better here.

How about:

"Add a new privileged ioctl so that xfs_scrub can ask the kernel to
verify the media of the devices backing an xfs filesystem, and have any
resulting media errors reported to fsnotify and xfs_healer.

"To accomplish this, the kernel allocates a folio between the base page
size and 1MB, and issues read IOs to a gradually incrementing range of
one of the storage devices underlying an xfs filesystem.  If any error
occurs, that raw error is reported to the calling process.  If the error
happens to be one of the ones that the kernel considers indicative of
data loss, then it will also be reported to xfs_healthmon and fsnotify.

"Driving the verification from the kernel enables xfs (and by extension
xfs_scrub) to have precise control over the size and error handling of
IOs that are issued to the underlying block device, and to emit
notifications about problems to other relevant kernel subsystems
immediately.

"Note that the caller is also allowed to reduce the size of the IO and
to ask for a relaxation period after each IO."

> > +
> > +struct xfs_group_data_lost {
> > +	xfs_agblock_t		startblock;
> > +	xfs_extlen_t		blockcount;
> > +};
> > +
> > +/* Report lost file data from rmap records */
> > +STATIC int
> > +xfs_verify_report_data_lost(
> > +	struct xfs_btree_cur		*cur,
> > +	const struct xfs_rmap_irec	*rec,
> > +	void				*data)
> > +{
> > +	struct xfs_mount		*mp = cur->bc_mp;
> > +	struct xfs_inode		*ip;
> > +	struct xfs_group_data_lost	*lost = data;
> > +	xfs_fileoff_t			fileoff = rec->rm_offset;
> > +	xfs_extlen_t			blocks = rec->rm_blockcount;
> > +	const bool			is_attr =
> > +			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
> > +	const xfs_agblock_t		lost_end =
> > +			lost->startblock + lost->blockcount;
> > +	const xfs_agblock_t		rmap_end =
> > +			rec->rm_startblock + rec->rm_blockcount;
> > +	int				error = 0;
> > +
> > +	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
> > +	       return 0;
> > +
> > +	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
> > +	if (error)
> > +		return 0;
> > +
> > +	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
> > +		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
> > +		goto out_rele;
> > +	}
> > +
> > +	if (is_attr) {
> > +		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
> > +		goto out_rele;
> > +	}
> > +
> > +	if (lost->startblock > rec->rm_startblock) {
> > +		fileoff += lost->startblock - rec->rm_startblock;
> > +		blocks -= lost->startblock - rec->rm_startblock;
> > +	}
> > +	if (rmap_end > lost_end)
> > +		blocks -= rmap_end - lost_end;
> > +
> > +	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
> > +			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
> > +
> > +out_rele:
> > +	xfs_irele(ip);
> > +	return 0;
> > +}
> > +
> > +/* Walk reverse mappings to look for all file data loss */
> > +STATIC int
> > +xfs_verify_report_losses(
> > +	struct xfs_mount	*mp,
> > +	enum xfs_group_type	type,
> > +	xfs_daddr_t		daddr,
> > +	u64			bblen)
> > +{
> > +	struct xfs_group	*xg = NULL;
> > +	struct xfs_trans	*tp;
> > +	xfs_fsblock_t		start_bno, end_bno;
> > +	uint32_t		start_gno, end_gno;
> > +	int			error;
> > +
> > +	if (type == XG_TYPE_RTG) {
> > +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> > +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> > +	} else {
> > +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> > +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> > +	}
> > +
> > +	tp = xfs_trans_alloc_empty(mp);
> > +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> > +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> > +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> > +		struct xfs_buf		*agf_bp = NULL;
> > +		struct xfs_rtgroup	*rtg = NULL;
> > +		struct xfs_btree_cur	*cur;
> > +		struct xfs_rmap_irec	ri_low = { };
> > +		struct xfs_rmap_irec	ri_high;
> > +		struct xfs_group_data_lost lost;
> > +
> > +		if (type == XG_TYPE_AG) {
> > +			struct xfs_perag	*pag = to_perag(xg);
> > +
> > +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> > +			if (error) {
> > +				xfs_perag_put(pag);
> > +				break;
> > +			}
> > +
> > +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> > +		} else {
> > +			rtg = to_rtg(xg);
> > +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> > +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> > +		}
> > +
> > +		/*
> > +		 * Set the rmap range from ri_low to ri_high, which represents
> > +		 * a [start, end] where we looking for the files or metadata.
> > +		 */
> > +		memset(&ri_high, 0xFF, sizeof(ri_high));
> > +		if (xg->xg_gno == start_gno)
> > +			ri_low.rm_startblock =
> > +				xfs_fsb_to_gbno(mp, start_bno, type);
> > +		if (xg->xg_gno == end_gno)
> > +			ri_high.rm_startblock =
> > +				xfs_fsb_to_gbno(mp, end_bno, type);
> > +
> > +		lost.startblock = ri_low.rm_startblock;
> > +		lost.blockcount = min(xg->xg_block_count,
> > +				      ri_high.rm_startblock + 1) -
> > +							ri_low.rm_startblock;
> > +
> > +		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> > +				xfs_verify_report_data_lost, &lost);
> > +		xfs_btree_del_cursor(cur, error);
> > +		if (agf_bp)
> > +			xfs_trans_brelse(tp, agf_bp);
> > +		if (rtg)
> > +			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
> > +		if (error) {
> > +			xfs_group_put(xg);
> > +			break;
> > +		}
> > +	}
> > +
> > +	xfs_trans_cancel(tp);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Compute the desired verify IO size.
> > + *
> > + * To minimize command overhead, we'd like to create bios that are 1MB, though
> > + * we allow the user to ask for a smaller size.
> > + */
> > +STATIC unsigned int
> > +xfs_verify_iosize(
> > +	const struct xfs_verify_media	*me,
> > +	struct xfs_buftarg		*btp,
> > +	uint64_t			bbcount)
> > +{
> > +	unsigned int			iosize =
> > +			min_not_zero(SZ_1M, me->me_max_io_size);
> > +
> > +	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
> > +	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
> > +
> > +	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
> > +			BBTOB(bbcount));
> > +}
> 
> > +/* Allocate as much memory as we can get for verification buffer. */
> > +STATIC struct folio *
> 
> Can we please retire STATIC already?

Ok.

> > +STATIC void
> > +xfs_verify_media_error(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_verify_media	*me,
> > +	struct xfs_buftarg	*btp,
> > +	xfs_daddr_t		daddr,
> > +	unsigned int		bio_bbcount,
> > +	blk_status_t		bio_status)
> > +{
> > +	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
> > +			bio_bbcount, bio_status);
> > +
> > +	/*
> > +	 * Pass any I/O error up to the caller if we didn't successfully verify

"Pass any error, I/O or otherwise, up to the caller..."

> > +	 * any bytes at all.
> > +	 */
> > +	if (me->me_start_daddr == daddr)
> > +		me->me_ioerror = -blk_status_to_errno(bio_status);
> > +
> > +	/*
> > +	 * PI validation failures, medium errors, or general IO errors are
> > +	 * treated as indicators of data loss.  Everything else are (hopefully)
> > +	 * transient errors and are not reported.
> > +	 */
> 
> But still left in me->me_ioerror.  Is that intentional?

Yeah.  All errors are reported to the ioctl caller, but only the ones
that sound like data loss get passed to healthmon/fsnotify.

> > +	switch (me->me_dev) {
> > +	case XFS_DEV_DATA:
> > +		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
> > +		break;
> > +	case XFS_DEV_RT:
> > +		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
> > +		break;
> > +	}
> 
> At some point we really need dev_to_group_type and vice versa helpers.

Heh, yes.  Do you want that /now/ or when we add a second user?

It also occurs to me that I could probably speed up the verify code a
tiny bit more by hoisting the bio variable scope outside the loop and
using bio_reuse to reset bi_iter and bi_sector, rather than freeing it
and allocating a new one.

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-16  5:44 ` [PATCH 11/11] xfs: add media verification ioctl Darrick J. Wong
  2026-01-19 15:56   ` Christoph Hellwig
@ 2026-01-20  4:12   ` Darrick J. Wong
  2026-01-20  7:18     ` Christoph Hellwig
  2026-02-06  3:01     ` Chris Mason
  1 sibling, 2 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-20  4:12 UTC (permalink / raw)
  To: cem, hch; +Cc: linux-xfs, linux-fsdevel

From: Darrick J. Wong <djwong@kernel.org>

Add a new privileged ioctl so that xfs_scrub can ask the kernel to
verify the media of the devices backing an xfs filesystem, and have any
resulting media errors reported to fsnotify and xfs_healer.

To accomplish this, the kernel allocates a folio between the base page
size and 1MB, and issues read IOs to a gradually incrementing range of
one of the storage devices underlying an xfs filesystem.  If any error
occurs, that raw error is reported to the calling process.  If the error
happens to be one of the ones that the kernel considers indicative of
data loss, then it will also be reported to xfs_healthmon and fsnotify.

Driving the verification from the kernel enables xfs (and by extension
xfs_scrub) to have precise control over the size and error handling of
IOs that are issued to the underlying block device, and to emit
notifications about problems to other relevant kernel subsystems
immediately.

Note that the caller is also allowed to reduce the size of the IO and
to ask for a relaxation period after each IO.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
v6.1: improve commit message, clarify comments about error handling,
      and reuse the bio instead of repeatedly allocating new ones
---
 fs/xfs/libxfs/xfs_fs.h    |   30 +++
 fs/xfs/xfs_trace.h        |   98 ++++++++++
 fs/xfs/xfs_verify_media.h |   13 +
 fs/xfs/Makefile           |    1 
 fs/xfs/xfs_ioctl.c        |    3 
 fs/xfs/xfs_verify_media.c |  445 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 590 insertions(+)
 create mode 100644 fs/xfs/xfs_verify_media.h
 create mode 100644 fs/xfs/xfs_verify_media.c

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index a01303c5de6ce6..d165de607d179e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1160,6 +1160,34 @@ struct xfs_health_file_on_monitored_fs {
 	__u32		flags;	/* zero for now */
 };
 
+/* Verify the media of the underlying devices */
+struct xfs_verify_media {
+	__u32	me_dev;		/* I: XFS_DEV_{DATA,LOG,RT} */
+	__u32	me_flags;	/* I: XFS_VERIFY_MEDIA_* */
+
+	/*
+	 * IO: inclusive start of disk range to verify, in 512b blocks.
+	 * Will be adjusted upwards as media verification succeeds.
+	 */
+	__u64	me_start_daddr;
+
+	/*
+	 * IO: exclusive end of the disk range to verify, in 512b blocks.
+	 * Can be adjusted downwards to match device size.
+	 */
+	__u64	me_end_daddr;
+
+	__u32	me_ioerror;	/* O: I/O error (positive) */
+	__u32	me_max_io_size;	/* I: maximum IO size in bytes */
+
+	__u32	me_rest_us;	/* I: rest time between IOs, usecs */
+	__u32	me_pad;		/* zero */
+};
+
+#define XFS_VERIFY_MEDIA_REPORT	(1 << 0)	/* report to fsnotify */
+
+#define XFS_VERIFY_MEDIA_FLAGS	(XFS_VERIFY_MEDIA_REPORT)
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1202,6 +1230,8 @@ struct xfs_health_file_on_monitored_fs {
 #define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 #define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \
 				_IOW ('X', 69, struct xfs_health_file_on_monitored_fs)
+#define XFS_IOC_VERIFY_MEDIA	_IOWR('X', 70, struct xfs_verify_media)
+
 /*
  * ioctl commands that replace IRIX syssgi()'s
  */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0cf4877753584f..3483461cf46255 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -6320,6 +6320,104 @@ TRACE_EVENT(xfs_healthmon_report_file_ioerror,
 		  __entry->error)
 );
 
+TRACE_EVENT(xfs_verify_media,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 const struct folio *folio),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, folio),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(unsigned int, bufsize)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->bufsize = folio_size(folio);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->bufsize)
+);
+
+TRACE_EVENT(xfs_verify_media_end,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev),
+	TP_ARGS(mp, me, fdev),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(int, ioerror)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->ioerror = me->me_ioerror;
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->ioerror)
+);
+
+TRACE_EVENT(xfs_verify_media_error,
+	TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me,
+		 dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount,
+		 blk_status_t status),
+	TP_ARGS(mp, me, fdev, daddr, bbcount, status),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, fdev)
+		__field(xfs_daddr_t, start_daddr)
+		__field(xfs_daddr_t, end_daddr)
+		__field(unsigned int, flags)
+		__field(xfs_daddr_t, daddr)
+		__field(uint64_t, bbcount)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_ddev_targp->bt_dev;
+		__entry->fdev = fdev;
+		__entry->start_daddr = me->me_start_daddr;
+		__entry->end_daddr = me->me_end_daddr;
+		__entry->flags = me->me_flags;
+		__entry->daddr = daddr;
+		__entry->bbcount = bbcount;
+		__entry->error = blk_status_to_errno(status);
+	),
+	TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->fdev), MINOR(__entry->fdev),
+		  __entry->start_daddr,
+		  __entry->end_daddr,
+		  __entry->flags,
+		  __entry->daddr,
+		  __entry->bbcount,
+		  __entry->error)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_verify_media.h b/fs/xfs/xfs_verify_media.h
new file mode 100644
index 00000000000000..dc6eee9c88636b
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_VERIFY_MEDIA_H__
+#define __XFS_VERIFY_MEDIA_H__
+
+struct xfs_verify_media;
+int xfs_ioc_verify_media(struct file *file,
+		struct xfs_verify_media __user *arg);
+
+#endif /* __XFS_VERIFY_MEDIA_H__ */
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d14f5ae2b980fe..7eadc263f728a2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -197,6 +197,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_symlink.o \
 				   xfs_sysfs.o \
 				   xfs_trans.o \
+				   xfs_verify_media.o \
 				   xfs_xattr.o
 
 # low-level transaction/log code
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c04c41ca924e37..80a005999d2df3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
 #include "xfs_healthmon.h"
+#include "xfs_verify_media.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1422,6 +1423,8 @@ xfs_file_ioctl(
 
 	case XFS_IOC_HEALTH_MONITOR:
 		return xfs_ioc_health_monitor(filp, arg);
+	case XFS_IOC_VERIFY_MEDIA:
+		return xfs_ioc_verify_media(filp, arg);
 
 	default:
 		return -ENOTTY;
diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
new file mode 100644
index 00000000000000..f4f620c98d92ca
--- /dev/null
+++ b/fs/xfs/xfs_verify_media.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_health.h"
+#include "xfs_healthmon.h"
+#include "xfs_trace.h"
+#include "xfs_verify_media.h"
+
+#include <linux/fserror.h>
+
+struct xfs_group_data_lost {
+	xfs_agblock_t		startblock;
+	xfs_extlen_t		blockcount;
+};
+
+/* Report lost file data from rmap records */
+static int
+xfs_verify_report_data_lost(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*data)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_inode		*ip;
+	struct xfs_group_data_lost	*lost = data;
+	xfs_fileoff_t			fileoff = rec->rm_offset;
+	xfs_extlen_t			blocks = rec->rm_blockcount;
+	const bool			is_attr =
+			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
+	const xfs_agblock_t		lost_end =
+			lost->startblock + lost->blockcount;
+	const xfs_agblock_t		rmap_end =
+			rec->rm_startblock + rec->rm_blockcount;
+	int				error = 0;
+
+	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+	       return 0;
+
+	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
+	if (error)
+		return 0;
+
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
+		goto out_rele;
+	}
+
+	if (is_attr) {
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
+		goto out_rele;
+	}
+
+	if (lost->startblock > rec->rm_startblock) {
+		fileoff += lost->startblock - rec->rm_startblock;
+		blocks -= lost->startblock - rec->rm_startblock;
+	}
+	if (rmap_end > lost_end)
+		blocks -= rmap_end - lost_end;
+
+	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
+			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);
+
+out_rele:
+	xfs_irele(ip);
+	return 0;
+}
+
+/* Walk the rmaps covering [daddr, daddr + bblen) and report lost file data */
+static int
+xfs_verify_report_losses(
+	struct xfs_mount	*mp,
+	enum xfs_group_type	type,
+	xfs_daddr_t		daddr,
+	u64			bblen)
+{
+	struct xfs_group	*xg = NULL;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		start_bno, end_bno;
+	uint32_t		start_gno, end_gno;
+	int			error;
+
+	if (type == XG_TYPE_RTG) {
+		start_bno = xfs_daddr_to_rtb(mp, daddr);
+		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+	} else {
+		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
+		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
+	}
+
+	tp = xfs_trans_alloc_empty(mp);
+	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
+	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
+	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
+		struct xfs_buf		*agf_bp = NULL;
+		struct xfs_rtgroup	*rtg = NULL;
+		struct xfs_btree_cur	*cur;
+		struct xfs_rmap_irec	ri_low = { };
+		struct xfs_rmap_irec	ri_high;
+		struct xfs_group_data_lost lost;
+
+		if (type == XG_TYPE_AG) {
+			struct xfs_perag	*pag = to_perag(xg);
+
+			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+			if (error) {
+				xfs_perag_put(pag);
+				break;
+			}
+
+			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+		} else {
+			rtg = to_rtg(xg);
+			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+		}
+
+		/*
+		 * Query the rmaps in [ri_low, ri_high], the lost part of this
+		 * group.  Clamp before the + 1 below so U32_MAX cannot wrap.
+		 */
+		memset(&ri_high, 0xFF, sizeof(ri_high));
+		if (xg->xg_gno == start_gno)
+			ri_low.rm_startblock =
+				xfs_fsb_to_gbno(mp, start_bno, type);
+		if (xg->xg_gno == end_gno)
+			ri_high.rm_startblock =
+				xfs_fsb_to_gbno(mp, end_bno, type);
+
+		lost.startblock = ri_low.rm_startblock;
+		lost.blockcount = min(xg->xg_block_count - 1,
+				      ri_high.rm_startblock) + 1 -
+							ri_low.rm_startblock;
+
+		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+				xfs_verify_report_data_lost, &lost);
+		xfs_btree_del_cursor(cur, error);
+		if (agf_bp)
+			xfs_trans_brelse(tp, agf_bp);
+		if (rtg)
+			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+		if (error) {
+			xfs_group_put(xg);
+			break;
+		}
+	}
+
+	xfs_trans_cancel(tp);
+	return 0;
+}
+
+/*
+ * Compute the desired verify IO size.
+ *
+ * To minimize command overhead, we'd like to create bios that are 1MB, though
+ * we allow the user to ask for a smaller size.
+ */
+static unsigned int
+xfs_verify_iosize(
+	const struct xfs_verify_media	*me,
+	struct xfs_buftarg		*btp,
+	uint64_t			bbcount)
+{
+	unsigned int			iosize =
+			min_not_zero(SZ_1M, me->me_max_io_size);
+
+	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
+	ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
+
+	return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
+			BBTOB(bbcount));
+}
+
+/* Allocate as much memory as we can get for verification buffer. */
+static struct folio *
+xfs_verify_alloc_folio(
+	const unsigned int	iosize)
+{
+	unsigned int		order = get_order(iosize);
+
+	while (order > 0) {
+		struct folio	*folio =
+			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+
+		if (folio)
+			return folio;
+		order--;
+	}
+
+	return folio_alloc(GFP_KERNEL, 0);
+}
+
+/* Report any kind of problem verifying media */
+static void
+xfs_verify_media_error(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me,
+	struct xfs_buftarg	*btp,
+	xfs_daddr_t		daddr,
+	unsigned int		bio_bbcount,
+	blk_status_t		bio_status)
+{
+	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
+			bio_bbcount, bio_status);
+
+	/*
+	 * Pass any error, I/O or otherwise, up to the caller if we didn't
+	 * successfully verify any bytes at all.
+	 */
+	if (me->me_start_daddr == daddr)
+		me->me_ioerror = -blk_status_to_errno(bio_status);
+
+	/*
+	 * PI validation failures, medium errors, or general IO errors are
+	 * treated as indicators of data loss.  Everything else is (hopefully)
+	 * a transient error and is not reported to healthmon or fsnotify.
+	 */
+	switch (bio_status) {
+	case BLK_STS_PROTECTION:
+	case BLK_STS_IOERR:
+	case BLK_STS_MEDIUM:
+		break;
+	default:
+		return;
+	}
+
+	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
+		return;
+
+	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);
+
+	if (!xfs_has_rmapbt(mp))
+		return;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
+		break;
+	case XFS_DEV_RT:
+		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
+		break;
+	}
+}
+
+/* Verify the media of an xfs device by submitting read requests to the disk. */
+static int
+xfs_verify_media(
+	struct xfs_mount	*mp,
+	struct xfs_verify_media	*me)
+{
+	struct xfs_buftarg	*btp = NULL;
+	struct bio		*bio;
+	struct folio		*folio;
+	xfs_daddr_t		daddr;
+	uint64_t		bbcount;
+	int			error = 0;
+
+	me->me_ioerror = 0;
+
+	switch (me->me_dev) {
+	case XFS_DEV_DATA:
+		btp = mp->m_ddev_targp;
+		break;
+	case XFS_DEV_LOG:
+		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
+			btp = mp->m_logdev_targp;
+		break;
+	case XFS_DEV_RT:
+		btp = mp->m_rtdev_targp;
+		break;
+	}
+	if (!btp)
+		return -ENODEV;
+
+	/*
+	 * If the caller told us to verify beyond the end of the disk, tell the
+	 * user exactly where that was.
+	 */
+	if (me->me_end_daddr > btp->bt_nr_sectors)
+		me->me_end_daddr = btp->bt_nr_sectors;
+
+	/* start and end have to be aligned to the lba size */
+	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
+			bdev_logical_block_size(btp->bt_bdev)))
+		return -EINVAL;
+
+	/*
+	 * end_daddr is the exclusive end of the range, so if start_daddr
+	 * reaches there (or beyond), there's no work to be done.
+	 */
+	if (me->me_start_daddr >= me->me_end_daddr)
+		return 0;
+
+	/*
+	 * There are three ranges involved here:
+	 *
+	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
+	 *    user wants to verify.  end_daddr can be beyond the end of the
+	 *    disk; we'll constrain it to the end if necessary.
+	 *
+	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
+	 *    verified.  We update daddr after each successful read.
+	 *    me->me_start_daddr is set to daddr before returning.
+	 *
+	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
+	 *    verifying.
+	 */
+	daddr = me->me_start_daddr;
+	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
+			  me->me_start_daddr;
+
+	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
+	if (!folio)
+		return -ENOMEM;
+
+	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
+			folio);
+
+	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
+	if (!bio) {
+		error = -ENOMEM;
+		goto out_folio;
+	}
+
+	while (bbcount > 0) {
+		unsigned int	bio_bbcount;
+		blk_status_t	bio_status;
+
+		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
+		bio->bi_iter.bi_sector = daddr;
+		bio_add_folio_nofail(bio, folio,
+				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
+				0);
+
+		/*
+		 * Save the length of the bio before we submit it, because we
+		 * need the original daddr and length for reporting IO errors
+		 * if the bio fails.
+		 */
+		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+		submit_bio_wait(bio);
+		bio_status = bio->bi_status;
+		if (bio_status != BLK_STS_OK) {
+			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
+					bio_status);
+			error = 0;
+			break;
+		}
+
+		daddr += bio_bbcount;
+		bbcount -= bio_bbcount;
+
+		if (bbcount == 0)
+			break;
+
+		if (me->me_rest_us) {
+			ktime_t	expires;
+
+			/* Widen to 64 bits; u32 rest_us * 1000 can wrap */
+			expires = ktime_add_us(ktime_get(), me->me_rest_us);
+			set_current_state(TASK_KILLABLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	bio_put(bio);
+out_folio:
+	folio_put(folio);
+
+	if (error)
+		return error;
+
+	/*
+	 * Advance start_daddr to the end of what we verified if there wasn't
+	 * an operational error.
+	 */
+	me->me_start_daddr = daddr;
+	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
+	return 0;
+}
+
+int
+xfs_ioc_verify_media(
+	struct file			*file,
+	struct xfs_verify_media __user	*arg)
+{
+	struct xfs_verify_media		me;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&me, arg, sizeof(me)))
+		return -EFAULT;
+
+	if (me.me_pad)
+		return -EINVAL;
+	if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
+		return -EINVAL;
+
+	switch (me.me_dev) {
+	case XFS_DEV_DATA:
+	case XFS_DEV_LOG:
+	case XFS_DEV_RT:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	error = xfs_verify_media(mp, &me);
+	if (error)
+		return error;
+
+	if (copy_to_user(arg, &me, sizeof(me)))
+		return -EFAULT;
+
+	return 0;
+}

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-20  4:12   ` [PATCH v6.1 " Darrick J. Wong
@ 2026-01-20  7:18     ` Christoph Hellwig
  2026-01-20 18:00       ` Darrick J. Wong
  2026-02-06  3:01     ` Chris Mason
  1 sibling, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-20  7:18 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel


> +		unsigned int	bio_bbcount;
> +		blk_status_t	bio_status;
> +
> +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> +		bio->bi_iter.bi_sector = daddr;
> +		bio_add_folio_nofail(bio, folio,
> +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> +				0);

You could actually use bio_reuse as you implied in the previous mail here
and save the bio_add_folio_nofail call.  Not really going to make much
of a difference, so:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-20  7:18     ` Christoph Hellwig
@ 2026-01-20 18:00       ` Darrick J. Wong
  2026-01-21  7:05         ` Christoph Hellwig
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-20 18:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: cem, linux-xfs, linux-fsdevel

On Tue, Jan 20, 2026 at 08:18:30AM +0100, Christoph Hellwig wrote:
> 
> > +		unsigned int	bio_bbcount;
> > +		blk_status_t	bio_status;
> > +
> > +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> > +		bio->bi_iter.bi_sector = daddr;
> > +		bio_add_folio_nofail(bio, folio,
> > +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> > +				0);
> 
> You could actually use bio_reuse as you implied in the previous mail here
> and save the bio_add_folio_nofail call.  Not really going to make much
> of a difference, so:

Hrm.  Is that bio_reuse patch queued for upstream?  Though maybe it'd be
easier to make a mental note (ha!) to clean this up once both appear
upstream.

> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks!

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-21  6:34 [PATCHSET v7 1/3] xfs: autonomous self healing of filesystems Darrick J. Wong
@ 2026-01-21  6:35 ` Darrick J. Wong
  0 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-21  6:35 UTC (permalink / raw)
  To: djwong, cem; +Cc: hch, linux-fsdevel, linux-xfs, hch

From: Darrick J. Wong <djwong@kernel.org>

Start creating helper functions and infrastructure to pass filesystem
health events to a health monitoring file.  Since this is an
administrative interface, we only support a single health monitor
process per filesystem, so we don't need to use anything fancy such as
notifier chains (== tons of indirect calls).

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |    7 +
 fs/xfs/xfs_healthmon.h |   36 +++++++
 fs/xfs/xfs_mount.h     |    4 +
 fs/xfs/Makefile        |    1 
 fs/xfs/xfs_health.c    |    1 
 fs/xfs/xfs_healthmon.c |  262 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_mount.c     |    2 
 8 files changed, 317 insertions(+)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_healthmon.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 12463ba766da05..c58e55b3df4099 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,6 +1003,12 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+struct xfs_health_monitor {
+	__u64	flags;		/* flags */
+	__u8	format;		/* output format */
+	__u8	pad[23];	/* zeroes */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1042,6 +1048,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
+#define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
new file mode 100644
index 00000000000000..218d5aac87b012
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_HEALTHMON_H__
+#define __XFS_HEALTHMON_H__
+
+struct xfs_healthmon {
+	/*
+	 * Weak reference to the xfs filesystem that is being monitored.  It
+	 * will be set to zero when the filesystem detaches from the monitor.
+	 * Do not dereference this pointer.
+	 */
+	uintptr_t			mount_cookie;
+
+	/*
+	 * Device number of the filesystem being monitored.  This is for
+	 * consistent tracing even after unmount.
+	 */
+	dev_t				dev;
+
+	/*
+	 * Reference count of this structure.  The open healthmon fd holds one
+	 * ref, the xfs_mount holds another ref if it points to this object,
+	 * and running event handlers hold their own refs.
+	 */
+	refcount_t			ref;
+};
+
+void xfs_healthmon_unmount(struct xfs_mount *mp);
+
+long xfs_ioc_health_monitor(struct file *file,
+		struct xfs_health_monitor __user *arg);
+
+#endif /* __XFS_HEALTHMON_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b871dfde372b52..61c71128d171cb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_healthmon;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
@@ -342,6 +343,9 @@ typedef struct xfs_mount {
 
 	/* Hook to feed dirent updates to an active online repair. */
 	struct xfs_hooks	m_dir_update_hooks;
+
+	/* Private data referring to a health monitor object. */
+	struct xfs_healthmon	*m_healthmon;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bf501cf827172..1b7385e23b3463 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_handle.o \
 				   xfs_health.o \
+				   xfs_healthmon.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index fbb8886c72fe5e..3d50397f8f7c00 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
new file mode 100644
index 00000000000000..b7095ea55897c5
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/eventpoll.h>
+#include <linux/poll.h>
+
+/*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace.  To
+ * accomplish this, we establish a thread_with_file kthread object to handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace.  When those internal events occur, the core
+ * filesystem code calls this health monitor to convey the events to userspace.
+ * Userspace reads events from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem.  The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
+
+/* sign of a detached health monitor */
+#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
+
+/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+/* Return a referenced healthmon for this mount, or NULL if none is live. */
+static struct xfs_healthmon *
+xfs_healthmon_get(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm;
+
+	rcu_read_lock();
+	hm = READ_ONCE(mp->m_healthmon); /* load once; detach can NULL it */
+	if (hm && !refcount_inc_not_zero(&hm->ref))
+		hm = NULL;
+	rcu_read_unlock();
+
+	return hm;
+}
+
+/*
+ * Release the reference to a healthmon object.  If there are no more holders,
+ * free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+static void
+xfs_healthmon_put(
+	struct xfs_healthmon		*hm)
+{
+	if (refcount_dec_and_test(&hm->ref))
+		kfree_rcu_mightsleep(hm);
+}
+
+/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
+STATIC int
+xfs_healthmon_attach(
+	struct xfs_mount	*mp,
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (mp->m_healthmon != NULL) {
+		spin_unlock(&xfs_healthmon_lock);
+		return -EEXIST;
+	}
+
+	refcount_inc(&hm->ref);
+	mp->m_healthmon = hm;
+	hm->mount_cookie = (uintptr_t)mp->m_super;
+	spin_unlock(&xfs_healthmon_lock);
+
+	return 0;
+}
+
+/* Detach a xfs mount from a specific healthmon instance. */
+STATIC void
+xfs_healthmon_detach(
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
+		spin_unlock(&xfs_healthmon_lock);
+		return;
+	}
+
+	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
+	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+	spin_unlock(&xfs_healthmon_lock);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Detach the xfs mount from this healthmon instance. */
+void
+xfs_healthmon_unmount(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	xfs_healthmon_detach(hm);
+	xfs_healthmon_put(hm);
+}
+
+STATIC ssize_t
+xfs_healthmon_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	return -EIO;
+}
+
+/* Free the health monitoring information. */
+STATIC int
+xfs_healthmon_release(
+	struct inode		*inode,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	/*
+	 * We might be closing the healthmon file before the filesystem
+	 * unmounts, because userspace processes can terminate at any time and
+	 * for any reason.  Null out xfs_mount::m_healthmon so that another
+	 * process can create another health monitor file.
+	 */
+	xfs_healthmon_detach(hm);
+
+	xfs_healthmon_put(hm);
+	return 0;
+}
+
+/* Validate ioctl parameters. */
+static inline bool
+xfs_healthmon_validate(
+	const struct xfs_health_monitor	*hmo)
+{
+	if (hmo->flags)
+		return false;
+	if (hmo->format)
+		return false;
+	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
+		return false;
+	return true;
+}
+
+/* Emit some data about the health monitoring fd. */
+static void
+xfs_healthmon_show_fdinfo(
+	struct seq_file		*m,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
+				"dead" : "alive",
+			MAJOR(hm->dev), MINOR(hm->dev));
+}
+
+static const struct file_operations xfs_healthmon_fops = {
+	.owner		= THIS_MODULE,
+	.show_fdinfo	= xfs_healthmon_show_fdinfo,
+	.read_iter	= xfs_healthmon_read_iter,
+	.release	= xfs_healthmon_release,
+};
+
+/*
+ * Create a health monitoring file.  Returns an index to the fd table or a
+ * negative errno.
+ */
+long
+xfs_ioc_health_monitor(
+	struct file			*file,
+	struct xfs_health_monitor __user *arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				ret;
+
+	/*
+	 * The only intended user of the health monitoring system should be the
+	 * xfs_healer daemon running on behalf of the whole filesystem in the
+	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
+	 * (they can use fsnotify) nor do we allow containers.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (ip->i_ino != mp->m_sb.sb_rootino)
+		return -EPERM;
+	if (current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+	if (!hm)
+		return -ENOMEM;
+	hm->dev = mp->m_super->s_dev;
+	refcount_set(&hm->ref, 1);
+
+	/*
+	 * Try to attach this health monitor to the xfs_mount.  The monitor is
+	 * considered live and will receive events if this succeeds.
+	 */
+	ret = xfs_healthmon_attach(mp, hm);
+	if (ret)
+		goto out_hm;
+
+	/*
+	 * Create the anonymous file and install a fd for it.  If it succeeds,
+	 * the file owns hm and can go away at any time, so we must not access
+	 * it again.  This must go last because we can't undo a fd table
+	 * installation.
+	 */
+	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+			O_CLOEXEC | O_RDONLY);
+	if (ret < 0)
+		goto out_mp;
+
+	return ret;
+
+out_mp:
+	xfs_healthmon_detach(hm);
+out_hm:
+	ASSERT(refcount_read(&hm->ref) == 1);
+	xfs_healthmon_put(hm);
+	return ret;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad77437181..c04c41ca924e37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1419,6 +1420,9 @@ xfs_file_ioctl(
 	case XFS_IOC_COMMIT_RANGE:
 		return xfs_ioc_commit_range(filp, arg);
 
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_ioc_health_monitor(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0953f6ae94abc8..ab67c91915384c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -625,6 +626,7 @@ xfs_unmount_flush_inodes(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
+	xfs_healthmon_unmount(mp);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-20 18:00       ` Darrick J. Wong
@ 2026-01-21  7:05         ` Christoph Hellwig
  2026-01-21 19:58           ` Darrick J. Wong
  0 siblings, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2026-01-21  7:05 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, cem, linux-xfs, linux-fsdevel

On Tue, Jan 20, 2026 at 10:00:40AM -0800, Darrick J. Wong wrote:
> On Tue, Jan 20, 2026 at 08:18:30AM +0100, Christoph Hellwig wrote:
> > 
> > > +		unsigned int	bio_bbcount;
> > > +		blk_status_t	bio_status;
> > > +
> > > +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> > > +		bio->bi_iter.bi_sector = daddr;
> > > +		bio_add_folio_nofail(bio, folio,
> > > +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> > > +				0);
> > 
> > You could actually use bio_reuse as you implied in the previous mail here
> > and save the bio_add_folio_nofail call.  Not really going to make much
> > of a difference, so:
> 
> Hrm.  Is that bio_reuse patch queued for upstream?  Though maybe it'd be
> easier to make a mental note (ha!) to clean this up once both appear
> upstream.

It is queued up in the xfs for-next tree.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-21  7:05         ` Christoph Hellwig
@ 2026-01-21 19:58           ` Darrick J. Wong
  0 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-01-21 19:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: cem, linux-xfs, linux-fsdevel

On Wed, Jan 21, 2026 at 08:05:56AM +0100, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 10:00:40AM -0800, Darrick J. Wong wrote:
> > On Tue, Jan 20, 2026 at 08:18:30AM +0100, Christoph Hellwig wrote:
> > > 
> > > > +		unsigned int	bio_bbcount;
> > > > +		blk_status_t	bio_status;
> > > > +
> > > > +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> > > > +		bio->bi_iter.bi_sector = daddr;
> > > > +		bio_add_folio_nofail(bio, folio,
> > > > +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> > > > +				0);
> > > 
> > > You could actually use bio_reuse as you implied in the previous mail here
> > > and save the bio_add_folio_nofail call.  Not really going to make much
> > > of a difference, so:
> > 
> > Hrm.  Is that bio_reuse patch queued for upstream?  Though maybe it'd be
> > easier to make a mental note (ha!) to clean this up once both appear
> > upstream.
> 
> It is queued up in the xfs for-next tree.

Ah, heh.  I'll see if cem merges the series atop his xfs-7.0-merge
branch and send a followup.  As it is I'm already going to ask Linus if
I can remove the old fsnotify error function (and any new callers that
might pop up) right before -rc1.

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-20  4:12   ` [PATCH v6.1 " Darrick J. Wong
  2026-01-20  7:18     ` Christoph Hellwig
@ 2026-02-06  3:01     ` Chris Mason
  2026-02-06  4:53       ` Darrick J. Wong
  1 sibling, 1 reply; 36+ messages in thread
From: Chris Mason @ 2026-02-06  3:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel

"Darrick J. Wong" <djwong@kernel.org> wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> verify the media of the devices backing an xfs filesystem, and have any
> resulting media errors reported to fsnotify and xfs_healer.

Hi everyone,

I'm trying out my AI review prompts on a few more trees, and I ran it
on the fuse-iomap-cache branch:

https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache

This was the latest email lore could find, apologies if I grabbed the wrong one.

Hopefully the review below is useful, please let me know if there are places it
has gone wildly wrong.  I kept the header to make it obvious exactly which
commit I'd run through.

(This is another XFS commit that ended up in my rev-list range, low confidence
in the actual review, but I'm curious if AI got this one right)

commit 2e5c40a5b6ca722202bc0e99830b6ecf8755d19c
Author: Darrick J. Wong <djwong@kernel.org>

xfs: add media verification ioctl

[ ... ]

> diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
> --- /dev/null
> +++ b/fs/xfs/xfs_verify_media.c
> @@ -0,0 +1,445 @@

[ ... ]

> +/* Walk reverse mappings to look for all file data loss */
> +static int
> +xfs_verify_report_losses(
> +	struct xfs_mount	*mp,
> +	enum xfs_group_type	type,
> +	xfs_daddr_t		daddr,
> +	u64			bblen)
> +{
> +	struct xfs_group	*xg = NULL;
> +	struct xfs_trans	*tp;
> +	xfs_fsblock_t		start_bno, end_bno;
> +	uint32_t		start_gno, end_gno;
> +	int			error;
> +
> +	if (type == XG_TYPE_RTG) {
> +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> +	} else {
> +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> +	}
> +
> +	tp = xfs_trans_alloc_empty(mp);
> +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> +		struct xfs_buf		*agf_bp = NULL;
> +		struct xfs_rtgroup	*rtg = NULL;
> +		struct xfs_btree_cur	*cur;
> +		struct xfs_rmap_irec	ri_low = { };
> +		struct xfs_rmap_irec	ri_high;
> +		struct xfs_group_data_lost lost;
> +
> +		if (type == XG_TYPE_AG) {
> +			struct xfs_perag	*pag = to_perag(xg);
> +
> +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> +			if (error) {
> +				xfs_perag_put(pag);
> +				break;
> +			}
> +
> +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> +		} else {
> +			rtg = to_rtg(xg);
> +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> +		}

Does this code have a reference counting mismatch in the error paths?

The xfs_group returned by xfs_group_next_range() is acquired via
xfs_group_grab() which increments xg_active_ref.

In the error path at line 120, the code uses xfs_perag_put() to release
the reference:

	error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
	if (error) {
		xfs_perag_put(pag);
		break;
	}

And later in the loop, line 144 uses xfs_group_put().

Both xfs_perag_put() and xfs_group_put() decrement xg_ref, not
xg_active_ref. Looking at the reference counting implementation:

xfs_group_grab() calls atomic_inc_not_zero(&xg->xg_active_ref)
xfs_group_rele() calls atomic_dec(&xg->xg_active_ref)
xfs_group_put() calls atomic_dec(&xg->xg_ref)

Should these error paths use xfs_group_rele() instead to match the
xfs_group_grab() in xfs_group_next_range()?


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-02-06  3:01     ` Chris Mason
@ 2026-02-06  4:53       ` Darrick J. Wong
  0 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-02-06  4:53 UTC (permalink / raw)
  To: Chris Mason; +Cc: cem, hch, linux-xfs, linux-fsdevel

On Thu, Feb 05, 2026 at 07:01:32PM -0800, Chris Mason wrote:
> "Darrick J. Wong" <djwong@kernel.org> wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> > verify the media of the devices backing an xfs filesystem, and have any
> > resulting media errors reported to fsnotify and xfs_healer.
> 
> Hi everyone,
> 
> I'm trying out my AI review prompts on a few more trees, and I ran it
> on the fuse-iomap-cache branch:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache
> 
> This was the latest email lore could find, apologies if I grabbed the wrong one.
> 
> Hopefully the review below is useful, please let me know if there are places it
> has gone wildly wrong.  I kept the header to make it obvious exactly which
> commit I'd run through.
> 
> (This is another XFS commit that ended up in my rev-list range, low confidence
> in the actual review, but I'm curious if AI got this one right)
> 
> commit 2e5c40a5b6ca722202bc0e99830b6ecf8755d19c
> Author: Darrick J. Wong <djwong@kernel.org>
> 
> xfs: add media verification ioctl
> 
> [ ... ]
> 
> > diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
> > --- /dev/null
> > +++ b/fs/xfs/xfs_verify_media.c
> > @@ -0,0 +1,445 @@
> 
> [ ... ]
> 
> > +/* Walk reverse mappings to look for all file data loss */
> > +static int
> > +xfs_verify_report_losses(
> > +	struct xfs_mount	*mp,
> > +	enum xfs_group_type	type,
> > +	xfs_daddr_t		daddr,
> > +	u64			bblen)
> > +{
> > +	struct xfs_group	*xg = NULL;
> > +	struct xfs_trans	*tp;
> > +	xfs_fsblock_t		start_bno, end_bno;
> > +	uint32_t		start_gno, end_gno;
> > +	int			error;
> > +
> > +	if (type == XG_TYPE_RTG) {
> > +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> > +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> > +	} else {
> > +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> > +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> > +	}
> > +
> > +	tp = xfs_trans_alloc_empty(mp);
> > +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> > +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> > +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> > +		struct xfs_buf		*agf_bp = NULL;
> > +		struct xfs_rtgroup	*rtg = NULL;
> > +		struct xfs_btree_cur	*cur;
> > +		struct xfs_rmap_irec	ri_low = { };
> > +		struct xfs_rmap_irec	ri_high;
> > +		struct xfs_group_data_lost lost;
> > +
> > +		if (type == XG_TYPE_AG) {
> > +			struct xfs_perag	*pag = to_perag(xg);
> > +
> > +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> > +			if (error) {
> > +				xfs_perag_put(pag);
> > +				break;
> > +			}
> > +
> > +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> > +		} else {
> > +			rtg = to_rtg(xg);
> > +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> > +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> > +		}
> 
> Does this code have a reference counting mismatch in the error paths?
> 
> The xfs_group returned by xfs_group_next_range() is acquired via
> xfs_group_grab() which increments xg_active_ref.
> 
> In the error path at line 120, the code uses xfs_perag_put() to release
> the reference:
> 
> 	error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> 	if (error) {
> 		xfs_perag_put(pag);
> 		break;
> 	}
> 
> And later in the loop, line 144 uses xfs_group_put().
> 
> Both xfs_perag_put() and xfs_group_put() decrement xg_ref, not
> xg_active_ref. Looking at the reference counting implementation:
> 
> xfs_group_grab() calls atomic_inc_not_zero(&xg->xg_active_ref)
> xfs_group_rele() calls atomic_dec(&xg->xg_active_ref)
> xfs_group_put() calls atomic_dec(&xg->xg_ref)
> 
> Should these error paths use xfs_group_rele() instead to match the
> xfs_group_grab() in xfs_group_next_range()?

Yep.  Wouldn't it be nice if our type system could keep those things
straight... :/

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-16  5:42 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
@ 2026-02-06 13:07   ` Pankaj Raghav (Samsung)
  2026-02-06 17:47     ` Darrick J. Wong
  0 siblings, 1 reply; 36+ messages in thread
From: Pankaj Raghav (Samsung) @ 2026-02-06 13:07 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> +
> +/* Grab a reference to the healthmon object for a given mount, if any. */
> +static struct xfs_healthmon *
> +xfs_healthmon_get(
> +	struct xfs_mount		*mp)
> +{
> +	struct xfs_healthmon		*hm;
> +
> +	rcu_read_lock();
> +	hm = mp->m_healthmon;

Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
compiler tricks that can result in an undefined behaviour? I am not sure
if I am being paranoid here.

> +	if (hm && !refcount_inc_not_zero(&hm->ref))
> +		hm = NULL;
> +	rcu_read_unlock();
> +
> +	return hm;
> +}
> +
> +/*
-- 
Pankaj

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 13:07   ` Pankaj Raghav (Samsung)
@ 2026-02-06 17:47     ` Darrick J. Wong
  2026-02-06 18:54       ` Pankaj Raghav
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-02-06 17:47 UTC (permalink / raw)
  To: Pankaj Raghav (Samsung); +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
> > +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> > +
> > +/* Grab a reference to the healthmon object for a given mount, if any. */
> > +static struct xfs_healthmon *
> > +xfs_healthmon_get(
> > +	struct xfs_mount		*mp)
> > +{
> > +	struct xfs_healthmon		*hm;
> > +
> > +	rcu_read_lock();
> > +	hm = mp->m_healthmon;
> 
> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
> compiler tricks that can result in an undefined behaviour? I am not sure
> if I am being paranoid here.

Compiler tricks?  We've taken the rcu read lock, which adds an
optimization barrier so that the mp->m_healthmon access can't be
reordered before the rcu_read_lock.  I'm not sure if that answers your
question.

<confused>

--D

> > +	if (hm && !refcount_inc_not_zero(&hm->ref))
> > +		hm = NULL;
> > +	rcu_read_unlock();
> > +
> > +	return hm;
> > +}
> > +
> > +/*
> -- 
> Pankaj
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 17:47     ` Darrick J. Wong
@ 2026-02-06 18:54       ` Pankaj Raghav
  2026-02-06 20:41         ` Darrick J. Wong
  0 siblings, 1 reply; 36+ messages in thread
From: Pankaj Raghav @ 2026-02-06 18:54 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav



On 2/6/26 18:47, Darrick J. Wong wrote:
> On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
>>> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
>>> +
>>> +/* Grab a reference to the healthmon object for a given mount, if any. */
>>> +static struct xfs_healthmon *
>>> +xfs_healthmon_get(
>>> +	struct xfs_mount		*mp)
>>> +{
>>> +	struct xfs_healthmon		*hm;
>>> +
>>> +	rcu_read_lock();
>>> +	hm = mp->m_healthmon;
>>
>> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
>> compiler tricks that can result in an undefined behaviour? I am not sure
>> if I am being paranoid here.
> 
> Compiler tricks?  We've taken the rcu read lock, which adds an
> optimization barrier so that the mp->m_healthmon access can't be
> reordered before the rcu_read_lock.  I'm not sure if that answers your
> question.
> 

This answers. So this is my understanding: RCU guarantees that we get a valid
object (actual data of m_healthmon) but does not guarantee the compiler will not reread
the pointer between checking if hm is !NULL and accessing the pointer as we are doing it
lockless.

So just a barrier() call in rcu_read_lock is enough to make sure this doesn't happen and probably
adding a READ_ONCE() is not needed?

--
Pankaj

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 18:54       ` Pankaj Raghav
@ 2026-02-06 20:41         ` Darrick J. Wong
  2026-02-09  6:34           ` Christoph Hellwig
  0 siblings, 1 reply; 36+ messages in thread
From: Darrick J. Wong @ 2026-02-06 20:41 UTC (permalink / raw)
  To: Pankaj Raghav; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 07:54:51PM +0100, Pankaj Raghav wrote:
> 
> 
> On 2/6/26 18:47, Darrick J. Wong wrote:
> > On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
> >>> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> >>> +
> >>> +/* Grab a reference to the healthmon object for a given mount, if any. */
> >>> +static struct xfs_healthmon *
> >>> +xfs_healthmon_get(
> >>> +	struct xfs_mount		*mp)
> >>> +{
> >>> +	struct xfs_healthmon		*hm;
> >>> +
> >>> +	rcu_read_lock();
> >>> +	hm = mp->m_healthmon;
> >>
> >> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
> >> compiler tricks that can result in an undefined behaviour? I am not sure
> >> if I am being paranoid here.
> > 
> > Compiler tricks?  We've taken the rcu read lock, which adds an
> > optimization barrier so that the mp->m_healthmon access can't be
> > reordered before the rcu_read_lock.  I'm not sure if that answers your
> > question.
> > 
> 
> This answers. So this is my understanding: RCU guarantees that we get
> a valid object (actual data of m_healthmon) but does not guarantee the
> compiler will not reread the pointer between checking if hm is !NULL
> and accessing the pointer as we are doing it lockless.

Oh, now I see what you're concerned about.  You're worried that the
compiler could turn this:

	if (hm && !refcount_inc_not_zero(&hm->ref))

into this:

	if (mp->m_healthmon && !refcount_inc_not_zero(&mp->m_healthmon->ref))

which then gives xfs_healthmon_detach the opening it needs to slip in
between the two dereferences of mp and turn m_healthmon into NULL,
leading the "mp->m_healthmon->ref" expression to become a NULL pointer
dereference.

> So just a barrier() call in rcu_read_lock is enough to make sure this
> doesn't happen and probably adding a READ_ONCE() is not needed?

Nope.  You're right, we do need READ_ONCE here.

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 20:41         ` Darrick J. Wong
@ 2026-02-09  6:34           ` Christoph Hellwig
  2026-02-10  4:57             ` Darrick J. Wong
  0 siblings, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2026-02-09  6:34 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Pankaj Raghav, cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 12:41:35PM -0800, Darrick J. Wong wrote:
> > So just a barrier() call in rcu_read_lock is enough to make sure this
> > doesn't happen and probably adding a READ_ONCE() is not needed?
> 
> Nope.  You're right, we do need READ_ONCE here.

The right thing is to use rcu_dereference / rcu_assign_pointer and add a
__rcu annotation to m_healthmon.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-09  6:34           ` Christoph Hellwig
@ 2026-02-10  4:57             ` Darrick J. Wong
  0 siblings, 0 replies; 36+ messages in thread
From: Darrick J. Wong @ 2026-02-10  4:57 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Pankaj Raghav, cem, linux-xfs, linux-fsdevel, p.raghav

On Mon, Feb 09, 2026 at 07:34:21AM +0100, Christoph Hellwig wrote:
> On Fri, Feb 06, 2026 at 12:41:35PM -0800, Darrick J. Wong wrote:
> > > So just a barrier() call in rcu_read_lock is enough to make sure this
> > > doesn't happen and probably adding a READ_ONCE() is not needed?
> > 
> > Nope.  You're right, we do need READ_ONCE here.
> 
> The right thing is to use rcu_dereference / rcu_assign_pointer and add a
> __rcu annotation to m_healthmon.

Noted, will change my fixpatch to do that.  Thanks for the pointer!

--D

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2026-02-10  4:57 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --)
2026-01-16  5:42 [PATCHSET v6] xfs: autonomous self healing of filesystems Darrick J. Wong
2026-01-16  5:42 ` [PATCH 01/11] docs: discuss autonomous self healing in the xfs online repair design doc Darrick J. Wong
2026-01-16  5:42 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
2026-02-06 13:07   ` Pankaj Raghav (Samsung)
2026-02-06 17:47     ` Darrick J. Wong
2026-02-06 18:54       ` Pankaj Raghav
2026-02-06 20:41         ` Darrick J. Wong
2026-02-09  6:34           ` Christoph Hellwig
2026-02-10  4:57             ` Darrick J. Wong
2026-01-16  5:42 ` [PATCH 03/11] xfs: create event queuing, formatting, and discovery infrastructure Darrick J. Wong
2026-01-16  5:43 ` [PATCH 04/11] xfs: convey filesystem unmount events to the health monitor Darrick J. Wong
2026-01-19 15:44   ` Christoph Hellwig
2026-01-16  5:43 ` [PATCH 05/11] xfs: convey metadata health " Darrick J. Wong
2026-01-16  5:43 ` [PATCH 06/11] xfs: convey filesystem shutdown " Darrick J. Wong
2026-01-19 15:44   ` Christoph Hellwig
2026-01-16  5:43 ` [PATCH 07/11] xfs: convey externally discovered fsdax media errors " Darrick J. Wong
2026-01-16  5:44 ` [PATCH 08/11] xfs: convey file I/O " Darrick J. Wong
2026-01-16  5:44 ` [PATCH 09/11] xfs: allow toggling verbose logging on the health monitoring file Darrick J. Wong
2026-01-16  5:44 ` [PATCH 10/11] xfs: check if an open file is on the health monitored fs Darrick J. Wong
2026-01-16  5:44 ` [PATCH 11/11] xfs: add media verification ioctl Darrick J. Wong
2026-01-19 15:56   ` Christoph Hellwig
2026-01-19 17:35     ` Darrick J. Wong
2026-01-20  4:12   ` [PATCH v6.1 " Darrick J. Wong
2026-01-20  7:18     ` Christoph Hellwig
2026-01-20 18:00       ` Darrick J. Wong
2026-01-21  7:05         ` Christoph Hellwig
2026-01-21 19:58           ` Darrick J. Wong
2026-02-06  3:01     ` Chris Mason
2026-02-06  4:53       ` Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2026-01-21  6:34 [PATCHSET v7 1/3] xfs: autonomous self healing of filesystems Darrick J. Wong
2026-01-21  6:35 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
2026-01-13  0:32 [PATCHSET v5] xfs: autonomous self healing of filesystems Darrick J. Wong
2026-01-13  0:33 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
2026-01-13 16:03   ` Christoph Hellwig
2026-01-06  7:10 [PATCHSET V4] xfs: autonomous self healing of filesystems Darrick J. Wong
2026-01-06  7:11 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong
2026-01-07  9:17   ` Christoph Hellwig
2026-01-07 18:50     ` Darrick J. Wong
2026-01-08 10:21       ` Christoph Hellwig

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.