Linux filesystem development
 help / color / mirror / Atom feed
* [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-01-21  6:34 [PATCHSET v7 1/3] xfs: autonomous self healing of filesystems Darrick J. Wong
@ 2026-01-21  6:35 ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2026-01-21  6:35 UTC (permalink / raw)
  To: djwong, cem; +Cc: hch, linux-fsdevel, linux-xfs, hch

From: Darrick J. Wong <djwong@kernel.org>

Start creating helper functions and infrastructure to pass filesystem
health events to a health monitoring file.  Since this is an
administrative interface, we only support a single health monitor
process per filesystem, so we don't need to use anything fancy such as
notifier chains (== tons of indirect calls).

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_fs.h |    7 +
 fs/xfs/xfs_healthmon.h |   36 +++++++
 fs/xfs/xfs_mount.h     |    4 +
 fs/xfs/Makefile        |    1 
 fs/xfs/xfs_health.c    |    1 
 fs/xfs/xfs_healthmon.c |  262 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_mount.c     |    2 
 8 files changed, 317 insertions(+)
 create mode 100644 fs/xfs/xfs_healthmon.h
 create mode 100644 fs/xfs/xfs_healthmon.c


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 12463ba766da05..c58e55b3df4099 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1003,6 +1003,12 @@ struct xfs_rtgroup_geometry {
 #define XFS_RTGROUP_GEOM_SICK_RMAPBT	(1U << 3)  /* reverse mappings */
 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT	(1U << 4)  /* reference counts */
 
+struct xfs_health_monitor {
+	__u64	flags;		/* flags */
+	__u8	format;		/* output format */
+	__u8	pad[23];	/* zeroes */
+};
+
 /*
  * ioctl commands that are used by Linux filesystems
  */
@@ -1042,6 +1048,7 @@ struct xfs_rtgroup_geometry {
 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
 #define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 64, struct xfs_scrub_vec_head)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
+#define XFS_IOC_HEALTH_MONITOR	_IOW ('X', 68, struct xfs_health_monitor)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_healthmon.h b/fs/xfs/xfs_healthmon.h
new file mode 100644
index 00000000000000..218d5aac87b012
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_HEALTHMON_H__
+#define __XFS_HEALTHMON_H__
+
+struct xfs_healthmon {
+	/*
+	 * Weak reference to the xfs filesystem that is being monitored.  It
+	 * will be set to zero when the filesystem detaches from the monitor.
+	 * Do not dereference this pointer.
+	 */
+	uintptr_t			mount_cookie;
+
+	/*
+	 * Device number of the filesystem being monitored.  This is for
+	 * consistent tracing even after unmount.
+	 */
+	dev_t				dev;
+
+	/*
+	 * Reference count of this structure.  The open healthmon fd holds one
+	 * ref, the xfs_mount holds another ref if it points to this object,
+	 * and running event handlers hold their own refs.
+	 */
+	refcount_t			ref;
+};
+
+void xfs_healthmon_unmount(struct xfs_mount *mp);
+
+long xfs_ioc_health_monitor(struct file *file,
+		struct xfs_health_monitor __user *arg);
+
+#endif /* __XFS_HEALTHMON_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b871dfde372b52..61c71128d171cb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -13,6 +13,7 @@ struct xfs_ail;
 struct xfs_quotainfo;
 struct xfs_da_geometry;
 struct xfs_perag;
+struct xfs_healthmon;
 
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
@@ -342,6 +343,9 @@ typedef struct xfs_mount {
 
 	/* Hook to feed dirent updates to an active online repair. */
 	struct xfs_hooks	m_dir_update_hooks;
+
+	/* Private data referring to a health monitor object. */
+	struct xfs_healthmon	*m_healthmon;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5bf501cf827172..1b7385e23b3463 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -88,6 +88,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_globals.o \
 				   xfs_handle.o \
 				   xfs_health.o \
+				   xfs_healthmon.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index fbb8886c72fe5e..3d50397f8f7c00 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_quota_defs.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/fserror.h>
 
diff --git a/fs/xfs/xfs_healthmon.c b/fs/xfs/xfs_healthmon.c
new file mode 100644
index 00000000000000..b7095ea55897c5
--- /dev/null
+++ b/fs/xfs/xfs_healthmon.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2024-2026 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/eventpoll.h>
+#include <linux/poll.h>
+
+/*
+ * Live Health Monitoring
+ * ======================
+ *
+ * Autonomous self-healing of XFS filesystems requires a means for the kernel
+ * to send filesystem health events to a monitoring daemon in userspace.  To
+ * accomplish this, we establish a thread_with_file kthread object to handle
+ * translating internal events about filesystem health into a format that can
+ * be parsed easily by userspace.  When those internal events occur, the core
+ * filesystem code calls this health monitor to convey the events to userspace.
+ * Userspace reads events from the file descriptor returned by the ioctl.
+ *
+ * The healthmon abstraction has a weak reference to the host filesystem mount
+ * so that the queueing and processing of the events do not pin the mount and
+ * cannot slow down the main filesystem.  The healthmon object can exist past
+ * the end of the filesystem mount.
+ */
+
+/* sign of a detached health monitor */
+#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)
+
+/* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */
+static DEFINE_SPINLOCK(xfs_healthmon_lock);
+
+/* Grab a reference to the healthmon object for a given mount, if any. */
+static struct xfs_healthmon *
+xfs_healthmon_get(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm;
+
+	rcu_read_lock();
+	hm = mp->m_healthmon;
+	if (hm && !refcount_inc_not_zero(&hm->ref))
+		hm = NULL;
+	rcu_read_unlock();
+
+	return hm;
+}
+
+/*
+ * Release the reference to a healthmon object.  If there are no more holders,
+ * free the health monitor after an RCU grace period to eliminate possibility
+ * of races with xfs_healthmon_get.
+ */
+static void
+xfs_healthmon_put(
+	struct xfs_healthmon		*hm)
+{
+	if (refcount_dec_and_test(&hm->ref))
+		kfree_rcu_mightsleep(hm);
+}
+
+/* Attach a health monitor to an xfs_mount.  Only one allowed at a time. */
+STATIC int
+xfs_healthmon_attach(
+	struct xfs_mount	*mp,
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (mp->m_healthmon != NULL) {
+		spin_unlock(&xfs_healthmon_lock);
+		return -EEXIST;
+	}
+
+	refcount_inc(&hm->ref);
+	mp->m_healthmon = hm;
+	hm->mount_cookie = (uintptr_t)mp->m_super;
+	spin_unlock(&xfs_healthmon_lock);
+
+	return 0;
+}
+
+/* Detach a xfs mount from a specific healthmon instance. */
+STATIC void
+xfs_healthmon_detach(
+	struct xfs_healthmon	*hm)
+{
+	spin_lock(&xfs_healthmon_lock);
+	if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
+		spin_unlock(&xfs_healthmon_lock);
+		return;
+	}
+
+	XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
+	hm->mount_cookie = DETACHED_MOUNT_COOKIE;
+	spin_unlock(&xfs_healthmon_lock);
+
+	xfs_healthmon_put(hm);
+}
+
+/* Detach the xfs mount from this healthmon instance. */
+void
+xfs_healthmon_unmount(
+	struct xfs_mount		*mp)
+{
+	struct xfs_healthmon		*hm = xfs_healthmon_get(mp);
+
+	if (!hm)
+		return;
+
+	xfs_healthmon_detach(hm);
+	xfs_healthmon_put(hm);
+}
+
+STATIC ssize_t
+xfs_healthmon_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	return -EIO;
+}
+
+/* Free the health monitoring information. */
+STATIC int
+xfs_healthmon_release(
+	struct inode		*inode,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	/*
+	 * We might be closing the healthmon file before the filesystem
+	 * unmounts, because userspace processes can terminate at any time and
+	 * for any reason.  Null out xfs_mount::m_healthmon so that another
+	 * process can create another health monitor file.
+	 */
+	xfs_healthmon_detach(hm);
+
+	xfs_healthmon_put(hm);
+	return 0;
+}
+
+/* Validate ioctl parameters. */
+static inline bool
+xfs_healthmon_validate(
+	const struct xfs_health_monitor	*hmo)
+{
+	if (hmo->flags)
+		return false;
+	if (hmo->format)
+		return false;
+	if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
+		return false;
+	return true;
+}
+
+/* Emit some data about the health monitoring fd. */
+static void
+xfs_healthmon_show_fdinfo(
+	struct seq_file		*m,
+	struct file		*file)
+{
+	struct xfs_healthmon	*hm = file->private_data;
+
+	seq_printf(m, "state:\t%s\ndev:\t%d:%d\n",
+			hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
+				"dead" : "alive",
+			MAJOR(hm->dev), MINOR(hm->dev));
+}
+
+static const struct file_operations xfs_healthmon_fops = {
+	.owner		= THIS_MODULE,
+	.show_fdinfo	= xfs_healthmon_show_fdinfo,
+	.read_iter	= xfs_healthmon_read_iter,
+	.release	= xfs_healthmon_release,
+};
+
+/*
+ * Create a health monitoring file.  Returns an index to the fd table or a
+ * negative errno.
+ */
+long
+xfs_ioc_health_monitor(
+	struct file			*file,
+	struct xfs_health_monitor __user *arg)
+{
+	struct xfs_health_monitor	hmo;
+	struct xfs_healthmon		*hm;
+	struct xfs_inode		*ip = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip->i_mount;
+	int				ret;
+
+	/*
+	 * The only intended user of the health monitoring system should be the
+	 * xfs_healer daemon running on behalf of the whole filesystem in the
+	 * initial user namespace.  IOWs, we don't allow unprivileged userspace
+	 * (they can use fsnotify) nor do we allow containers.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (ip->i_ino != mp->m_sb.sb_rootino)
+		return -EPERM;
+	if (current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (copy_from_user(&hmo, arg, sizeof(hmo)))
+		return -EFAULT;
+
+	if (!xfs_healthmon_validate(&hmo))
+		return -EINVAL;
+
+	hm = kzalloc(sizeof(*hm), GFP_KERNEL);
+	if (!hm)
+		return -ENOMEM;
+	hm->dev = mp->m_super->s_dev;
+	refcount_set(&hm->ref, 1);
+
+	/*
+	 * Try to attach this health monitor to the xfs_mount.  The monitor is
+	 * considered live and will receive events if this succeeds.
+	 */
+	ret = xfs_healthmon_attach(mp, hm);
+	if (ret)
+		goto out_hm;
+
+	/*
+	 * Create the anonymous file and install a fd for it.  If it succeeds,
+	 * the file owns hm and can go away at any time, so we must not access
+	 * it again.  This must go last because we can't undo a fd table
+	 * installation.
+	 */
+	ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
+			O_CLOEXEC | O_RDONLY);
+	if (ret < 0)
+		goto out_mp;
+
+	return ret;
+
+out_mp:
+	xfs_healthmon_detach(hm);
+out_hm:
+	ASSERT(refcount_read(&hm->ref) == 1);
+	xfs_healthmon_put(hm);
+	return ret;
+}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 59eaad77437181..c04c41ca924e37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,7 @@
 #include "xfs_exchrange.h"
 #include "xfs_handle.h"
 #include "xfs_rtgroup.h"
+#include "xfs_healthmon.h"
 
 #include <linux/mount.h>
 #include <linux/fileattr.h>
@@ -1419,6 +1420,9 @@ xfs_file_ioctl(
 	case XFS_IOC_COMMIT_RANGE:
 		return xfs_ioc_commit_range(filp, arg);
 
+	case XFS_IOC_HEALTH_MONITOR:
+		return xfs_ioc_health_monitor(filp, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0953f6ae94abc8..ab67c91915384c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
 #include "xfs_zone_alloc.h"
+#include "xfs_healthmon.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -625,6 +626,7 @@ xfs_unmount_flush_inodes(
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
+	xfs_healthmon_unmount(mp);
 }
 
 static void


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
       [not found]       ` <20260120180040.GU15551@frogsfrogsfrogs>
@ 2026-01-21  7:05         ` Christoph Hellwig
  2026-01-21 19:58           ` Darrick J. Wong
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2026-01-21  7:05 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, cem, linux-xfs, linux-fsdevel

On Tue, Jan 20, 2026 at 10:00:40AM -0800, Darrick J. Wong wrote:
> On Tue, Jan 20, 2026 at 08:18:30AM +0100, Christoph Hellwig wrote:
> > 
> > > +		unsigned int	bio_bbcount;
> > > +		blk_status_t	bio_status;
> > > +
> > > +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> > > +		bio->bi_iter.bi_sector = daddr;
> > > +		bio_add_folio_nofail(bio, folio,
> > > +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> > > +				0);
> > 
> > You could actually use bio_reuse as you implied in the previous mail here
> > and save the bio_add_folio_nofail call.  Not really going to make much
> > of a difference, so:
> 
> Hrm.  Is that bio_reuse patch queued for upstream?  Though maybe it'd be
> easier to make a mental note (ha!) to clean this up once both appear
> upstream.

It is queued up in the xfs for-next tree.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-01-21  7:05         ` [PATCH v6.1 11/11] xfs: add media verification ioctl Christoph Hellwig
@ 2026-01-21 19:58           ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2026-01-21 19:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: cem, linux-xfs, linux-fsdevel

On Wed, Jan 21, 2026 at 08:05:56AM +0100, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 10:00:40AM -0800, Darrick J. Wong wrote:
> > On Tue, Jan 20, 2026 at 08:18:30AM +0100, Christoph Hellwig wrote:
> > > 
> > > > +		unsigned int	bio_bbcount;
> > > > +		blk_status_t	bio_status;
> > > > +
> > > > +		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
> > > > +		bio->bi_iter.bi_sector = daddr;
> > > > +		bio_add_folio_nofail(bio, folio,
> > > > +				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
> > > > +				0);
> > > 
> > > You could actually use bio_reuse as you implied in the previous mail here
> > > and save the bio_add_folio_nofail call.  Not really going to make much
> > > of a difference, so:
> > 
> > Hrm.  Is that bio_reuse patch queued for upstream?  Though maybe it'd be
> > easier to make a mental note (ha!) to clean this up once both appear
> > upstream.
> 
> It is queued up in the xfs for-next tree.

Ah, heh.  I'll see if cem merges the series atop his xfs-7.0-merge
branch and send a followup.  As it is I'm already going to ask Linus if
I can remove the old fsnotify error function (and any new callers that
might pop up) right before -rc1.

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
       [not found]   ` <20260120041226.GJ15551@frogsfrogsfrogs>
       [not found]     ` <20260120071830.GA5686@lst.de>
@ 2026-02-06  3:01     ` Chris Mason
  2026-02-06  4:53       ` Darrick J. Wong
  1 sibling, 1 reply; 11+ messages in thread
From: Chris Mason @ 2026-02-06  3:01 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel

"Darrick J. Wong" <djwong@kernel.org> wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> verify the media of the devices backing an xfs filesystem, and have any
> resulting media errors reported to fsnotify and xfs_healer.

Hi everyone,

I'm trying out my AI review prompts on a few more trees, and I ran it
on the fuse-iomap-cache branch:

https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache

This was the latest email lore could find, apologies if I grabbed the wrong one.

Hopefully the review below is useful, please let me know if there are places it
has gone wildly wrong.  I kept the header to make it obvious exactly which
commit I'd run through.

(This is another XFS commit that ended up in my rev-list range, low confidence
in the actual review, but I'm curious if AI got this one right)

commit 2e5c40a5b6ca722202bc0e99830b6ecf8755d19c
Author: Darrick J. Wong <djwong@kernel.org>

xfs: add media verification ioctl

[ ... ]

> diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
> --- /dev/null
> +++ b/fs/xfs/xfs_verify_media.c
> @@ -0,0 +1,445 @@

[ ... ]

> +/* Walk reverse mappings to look for all file data loss */
> +static int
> +xfs_verify_report_losses(
> +	struct xfs_mount	*mp,
> +	enum xfs_group_type	type,
> +	xfs_daddr_t		daddr,
> +	u64			bblen)
> +{
> +	struct xfs_group	*xg = NULL;
> +	struct xfs_trans	*tp;
> +	xfs_fsblock_t		start_bno, end_bno;
> +	uint32_t		start_gno, end_gno;
> +	int			error;
> +
> +	if (type == XG_TYPE_RTG) {
> +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> +	} else {
> +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> +	}
> +
> +	tp = xfs_trans_alloc_empty(mp);
> +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> +		struct xfs_buf		*agf_bp = NULL;
> +		struct xfs_rtgroup	*rtg = NULL;
> +		struct xfs_btree_cur	*cur;
> +		struct xfs_rmap_irec	ri_low = { };
> +		struct xfs_rmap_irec	ri_high;
> +		struct xfs_group_data_lost lost;
> +
> +		if (type == XG_TYPE_AG) {
> +			struct xfs_perag	*pag = to_perag(xg);
> +
> +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> +			if (error) {
> +				xfs_perag_put(pag);
> +				break;
> +			}
> +
> +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> +		} else {
> +			rtg = to_rtg(xg);
> +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> +		}

Does this code have a reference counting mismatch in the error paths?

The xfs_group returned by xfs_group_next_range() is acquired via
xfs_group_grab() which increments xg_active_ref.

In the error path at line 120, the code uses xfs_perag_put() to release
the reference:

	error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
	if (error) {
		xfs_perag_put(pag);
		break;
	}

And later in the loop, line 144 uses xfs_group_put().

Both xfs_perag_put() and xfs_group_put() decrement xg_ref, not
xg_active_ref. Looking at the reference counting implementation:

xfs_group_grab() calls atomic_inc_not_zero(&xg->xg_active_ref)
xfs_group_rele() calls atomic_dec(&xg->xg_active_ref)
xfs_group_put() calls atomic_dec(&xg->xg_ref)

Should these error paths use xfs_group_rele() instead to match the
xfs_group_grab() in xfs_group_next_range()?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v6.1 11/11] xfs: add media verification ioctl
  2026-02-06  3:01     ` Chris Mason
@ 2026-02-06  4:53       ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2026-02-06  4:53 UTC (permalink / raw)
  To: Chris Mason; +Cc: cem, hch, linux-xfs, linux-fsdevel

On Thu, Feb 05, 2026 at 07:01:32PM -0800, Chris Mason wrote:
> "Darrick J. Wong" <djwong@kernel.org> wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Add a new privileged ioctl so that xfs_scrub can ask the kernel to
> > verify the media of the devices backing an xfs filesystem, and have any
> > resulting media errors reported to fsnotify and xfs_healer.
> 
> Hi everyone,
> 
> I'm trying out my AI review prompts on a few more trees, and I ran it
> on the fuse-iomap-cache branch:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-cache
> 
> This was the latest email lore could find, apologies if I grabbed the wrong one.
> 
> Hopefully the review below is useful, please let me know if there are places it
> has gone wildly wrong.  I kept the header to make it obvious exactly which
> commit I'd run through.
> 
> (This is another XFS commit that ended up in my rev-list range, low confidence
> in the actual review, but I'm curious if AI got this one right)
> 
> commit 2e5c40a5b6ca722202bc0e99830b6ecf8755d19c
> Author: Darrick J. Wong <djwong@kernel.org>
> 
> xfs: add media verification ioctl
> 
> [ ... ]
> 
> > diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c
> > --- /dev/null
> > +++ b/fs/xfs/xfs_verify_media.c
> > @@ -0,0 +1,445 @@
> 
> [ ... ]
> 
> > +/* Walk reverse mappings to look for all file data loss */
> > +static int
> > +xfs_verify_report_losses(
> > +	struct xfs_mount	*mp,
> > +	enum xfs_group_type	type,
> > +	xfs_daddr_t		daddr,
> > +	u64			bblen)
> > +{
> > +	struct xfs_group	*xg = NULL;
> > +	struct xfs_trans	*tp;
> > +	xfs_fsblock_t		start_bno, end_bno;
> > +	uint32_t		start_gno, end_gno;
> > +	int			error;
> > +
> > +	if (type == XG_TYPE_RTG) {
> > +		start_bno = xfs_daddr_to_rtb(mp, daddr);
> > +		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
> > +	} else {
> > +		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
> > +		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
> > +	}
> > +
> > +	tp = xfs_trans_alloc_empty(mp);
> > +	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
> > +	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
> > +	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
> > +		struct xfs_buf		*agf_bp = NULL;
> > +		struct xfs_rtgroup	*rtg = NULL;
> > +		struct xfs_btree_cur	*cur;
> > +		struct xfs_rmap_irec	ri_low = { };
> > +		struct xfs_rmap_irec	ri_high;
> > +		struct xfs_group_data_lost lost;
> > +
> > +		if (type == XG_TYPE_AG) {
> > +			struct xfs_perag	*pag = to_perag(xg);
> > +
> > +			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> > +			if (error) {
> > +				xfs_perag_put(pag);
> > +				break;
> > +			}
> > +
> > +			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
> > +		} else {
> > +			rtg = to_rtg(xg);
> > +			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> > +			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> > +		}
> 
> Does this code have a reference counting mismatch in the error paths?
> 
> The xfs_group returned by xfs_group_next_range() is acquired via
> xfs_group_grab() which increments xg_active_ref.
> 
> In the error path at line 120, the code uses xfs_perag_put() to release
> the reference:
> 
> 	error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
> 	if (error) {
> 		xfs_perag_put(pag);
> 		break;
> 	}
> 
> And later in the loop, line 144 uses xfs_group_put().
> 
> Both xfs_perag_put() and xfs_group_put() decrement xg_ref, not
> xg_active_ref. Looking at the reference counting implementation:
> 
> xfs_group_grab() calls atomic_inc_not_zero(&xg->xg_active_ref)
> xfs_group_rele() calls atomic_dec(&xg->xg_active_ref)
> xfs_group_put() calls atomic_dec(&xg->xg_ref)
> 
> Should these error paths use xfs_group_rele() instead to match the
> xfs_group_grab() in xfs_group_next_range()?

Yep.  Wouldn't it be nice if our type system could keep those things
straight... :/

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
       [not found] ` <176852588582.2137143.1283636639551788931.stgit@frogsfrogsfrogs>
@ 2026-02-06 13:07   ` Pankaj Raghav (Samsung)
  2026-02-06 17:47     ` Darrick J. Wong
  0 siblings, 1 reply; 11+ messages in thread
From: Pankaj Raghav (Samsung) @ 2026-02-06 13:07 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> +
> +/* Grab a reference to the healthmon object for a given mount, if any. */
> +static struct xfs_healthmon *
> +xfs_healthmon_get(
> +	struct xfs_mount		*mp)
> +{
> +	struct xfs_healthmon		*hm;
> +
> +	rcu_read_lock();
> +	hm = mp->m_healthmon;

Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
compiler tricks that can result in an undefined behaviour? I am not sure
if I am being paranoid here.

> +	if (hm && !refcount_inc_not_zero(&hm->ref))
> +		hm = NULL;
> +	rcu_read_unlock();
> +
> +	return hm;
> +}
> +
> +/*
-- 
Pankaj

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 13:07   ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Pankaj Raghav (Samsung)
@ 2026-02-06 17:47     ` Darrick J. Wong
  2026-02-06 18:54       ` Pankaj Raghav
  0 siblings, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2026-02-06 17:47 UTC (permalink / raw)
  To: Pankaj Raghav (Samsung); +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
> > +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> > +
> > +/* Grab a reference to the healthmon object for a given mount, if any. */
> > +static struct xfs_healthmon *
> > +xfs_healthmon_get(
> > +	struct xfs_mount		*mp)
> > +{
> > +	struct xfs_healthmon		*hm;
> > +
> > +	rcu_read_lock();
> > +	hm = mp->m_healthmon;
> 
> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
> compiler tricks that can result in an undefined behaviour? I am not sure
> if I am being paranoid here.

Compiler tricks?  We've taken the rcu read lock, which adds an
optimization barrier so that the mp->m_healthmon access can't be
reordered before the rcu_read_lock.  I'm not sure if that answers your
question.

<confused>

--D

> > +	if (hm && !refcount_inc_not_zero(&hm->ref))
> > +		hm = NULL;
> > +	rcu_read_unlock();
> > +
> > +	return hm;
> > +}
> > +
> > +/*
> -- 
> Pankaj
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 17:47     ` Darrick J. Wong
@ 2026-02-06 18:54       ` Pankaj Raghav
  2026-02-06 20:41         ` Darrick J. Wong
  0 siblings, 1 reply; 11+ messages in thread
From: Pankaj Raghav @ 2026-02-06 18:54 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav



On 2/6/26 18:47, Darrick J. Wong wrote:
> On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
>>> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
>>> +
>>> +/* Grab a reference to the healthmon object for a given mount, if any. */
>>> +static struct xfs_healthmon *
>>> +xfs_healthmon_get(
>>> +	struct xfs_mount		*mp)
>>> +{
>>> +	struct xfs_healthmon		*hm;
>>> +
>>> +	rcu_read_lock();
>>> +	hm = mp->m_healthmon;
>>
>> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
>> compiler tricks that can result in an undefined behaviour? I am not sure
>> if I am being paranoid here.
> 
> Compiler tricks?  We've taken the rcu read lock, which adds an
> optimization barrier so that the mp->m_healthmon access can't be
> reordered before the rcu_read_lock.  I'm not sure if that answers your
> question.
> 

This answers. So this is my understanding: RCU guarantees that we get a valid
object (actual data of m_healthmon) but does not guarantee the compiler will not reread
the pointer between checking if hm is !NULL and accessing the pointer as we are doing it
lockless.

So just a barrier() call in rcu_read_lock is enough to make sure this doesn't happen and probably
adding a READ_ONCE() is not needed?

--
Pankaj

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 18:54       ` Pankaj Raghav
@ 2026-02-06 20:41         ` Darrick J. Wong
  2026-02-09  6:34           ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2026-02-06 20:41 UTC (permalink / raw)
  To: Pankaj Raghav; +Cc: cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 07:54:51PM +0100, Pankaj Raghav wrote:
> 
> 
> On 2/6/26 18:47, Darrick J. Wong wrote:
> > On Fri, Feb 06, 2026 at 02:07:56PM +0100, Pankaj Raghav (Samsung) wrote:
> >>> +static DEFINE_SPINLOCK(xfs_healthmon_lock);
> >>> +
> >>> +/* Grab a reference to the healthmon object for a given mount, if any. */
> >>> +static struct xfs_healthmon *
> >>> +xfs_healthmon_get(
> >>> +	struct xfs_mount		*mp)
> >>> +{
> >>> +	struct xfs_healthmon		*hm;
> >>> +
> >>> +	rcu_read_lock();
> >>> +	hm = mp->m_healthmon;
> >>
> >> Nit: Should we do a READ_ONCE(mp->m_healthmon) here to avoid any
> >> compiler tricks that can result in an undefined behaviour? I am not sure
> >> if I am being paranoid here.
> > 
> > Compiler tricks?  We've taken the rcu read lock, which adds an
> > optimization barrier so that the mp->m_healthmon access can't be
> > reordered before the rcu_read_lock.  I'm not sure if that answers your
> > question.
> > 
> 
> This answers. So this is my understanding: RCU guarantees that we get
> a valid object (actual data of m_healthmon) but does not guarantee the
> compiler will not reread the pointer between checking if hm is !NULL
> and accessing the pointer as we are doing it lockless.

Oh, now I see what you're concerned about.  You're worried that the
compiler could turn this:

	if (hm && !refcount_inc_not_zero(&hm->ref))

into this:

	if (mp->m_healthmon && !refcount_inc_not_zero(&mp->m_healthmon->ref))

which then gives xfs_healthmon_detach the opening it needs to slip in
between the two dereferences of mp and turn m_healthmon into NULL,
leading the "mp->m_healthmon->ref" expression to become a NULL pointer
dereference.

> So just a barrier() call in rcu_read_lock is enough to make sure this
> doesn't happen and probably adding a READ_ONCE() is not needed?

Nope.  You're right, we do need READ_ONCE here.

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-06 20:41         ` Darrick J. Wong
@ 2026-02-09  6:34           ` Christoph Hellwig
  2026-02-10  4:57             ` Darrick J. Wong
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2026-02-09  6:34 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Pankaj Raghav, cem, hch, linux-xfs, linux-fsdevel, p.raghav

On Fri, Feb 06, 2026 at 12:41:35PM -0800, Darrick J. Wong wrote:
> > So just a barrier() call in rcu_read_lock is enough to make sure this
> > doesn't happen and probably adding a READ_ONCE() is not needed?
> 
> Nope.  You're right, we do need READ_ONCE here.

The right thing is to use rcu_dereference / rcu_assign_pointer and add a
__rcu annotation to m_healthmon.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 02/11] xfs: start creating infrastructure for health monitoring
  2026-02-09  6:34           ` Christoph Hellwig
@ 2026-02-10  4:57             ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2026-02-10  4:57 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Pankaj Raghav, cem, linux-xfs, linux-fsdevel, p.raghav

On Mon, Feb 09, 2026 at 07:34:21AM +0100, Christoph Hellwig wrote:
> On Fri, Feb 06, 2026 at 12:41:35PM -0800, Darrick J. Wong wrote:
> > > So just a barrier() call in rcu_read_lock is enough to make sure this
> > > doesn't happen and probably adding a READ_ONCE() is not needed?
> > 
> > Nope.  You're right, we do need READ_ONCE here.
> 
> The right thing is to use rcu_dereference / rcu_assign_pointer and add a
> __rcu annotation to m_healthmon.

Noted, will change my fixpatch to do that.  Thanks for the pointer!

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-02-10  4:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <176852588473.2137143.1604994842772101197.stgit@frogsfrogsfrogs>
     [not found] ` <176852588776.2137143.7103003682733018282.stgit@frogsfrogsfrogs>
     [not found]   ` <20260120041226.GJ15551@frogsfrogsfrogs>
     [not found]     ` <20260120071830.GA5686@lst.de>
     [not found]       ` <20260120180040.GU15551@frogsfrogsfrogs>
2026-01-21  7:05         ` [PATCH v6.1 11/11] xfs: add media verification ioctl Christoph Hellwig
2026-01-21 19:58           ` Darrick J. Wong
2026-02-06  3:01     ` Chris Mason
2026-02-06  4:53       ` Darrick J. Wong
     [not found] ` <176852588582.2137143.1283636639551788931.stgit@frogsfrogsfrogs>
2026-02-06 13:07   ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Pankaj Raghav (Samsung)
2026-02-06 17:47     ` Darrick J. Wong
2026-02-06 18:54       ` Pankaj Raghav
2026-02-06 20:41         ` Darrick J. Wong
2026-02-09  6:34           ` Christoph Hellwig
2026-02-10  4:57             ` Darrick J. Wong
2026-01-21  6:34 [PATCHSET v7 1/3] xfs: autonomous self healing of filesystems Darrick J. Wong
2026-01-21  6:35 ` [PATCH 02/11] xfs: start creating infrastructure for health monitoring Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox