* [PATCH 1/7] xfs_healer: fix error reporting
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
@ 2026-06-02 4:58 ` Darrick J. Wong
2026-06-02 6:31 ` Christoph Hellwig
2026-06-02 4:58 ` [PATCH 2/7] xfs_healer: recommend offline fsck for XCORRUPT repairs Darrick J. Wong
` (5 subsequent siblings)
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:58 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Codex complains about incorrect error reporting logic here -- in the
first place we totally fail to pick up errno; in the second, we do, but
with inverted sign. Fix both.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: b9b03770197d19 ("xfs_healer: create daemon to listen for health events")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/xfs_healer.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/healer/xfs_healer.c b/healer/xfs_healer.c
index 36e52a71bebfa7..4ea8d86ac9114a 100644
--- a/healer/xfs_healer.c
+++ b/healer/xfs_healer.c
@@ -453,7 +453,7 @@ setup_monitor(
* the kernel as quickly as we can. Note that the kernel won't accrue
* more than 32K of internal events before it starts dropping them.
*/
- ret = workqueue_create_bound(&ctx->event_queue, ctx, healer_nproc(ctx),
+ ret = -workqueue_create_bound(&ctx->event_queue, ctx, healer_nproc(ctx),
1048576 / sizeof(struct xfs_health_monitor_event));
if (ret) {
errno = ret;
@@ -503,10 +503,12 @@ monitor(
nr = fread(hme, sizeof(*hme), 1, ctx->mon_fp);
if (ferror(ctx->mon_fp)) {
+ int error = errno;
+
pthread_mutex_lock(&ctx->conlock);
fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
_("error reading event file"),
- strerror(ret));
+ strerror(error));
pthread_mutex_unlock(&ctx->conlock);
free(hme);
ret = -1;
@@ -521,7 +523,7 @@ monitor(
mounted = false;
/* handle_event owns hme if the workqueue_add succeeds */
- ret = workqueue_add(&ctx->event_queue, handle_event, 0, hme);
+ ret = -workqueue_add(&ctx->event_queue, handle_event, 0, hme);
if (ret) {
pthread_mutex_lock(&ctx->conlock);
fprintf(stderr, "%s: %s: %s\n", ctx->mntpoint,
@@ -529,6 +531,7 @@ monitor(
strerror(ret));
pthread_mutex_unlock(&ctx->conlock);
free(hme);
+ ret = -1;
break;
}
} while (nr > 0 && mounted);
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 2/7] xfs_healer: recommend offline fsck for XCORRUPT repairs
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
2026-06-02 4:58 ` [PATCH 1/7] xfs_healer: fix error reporting Darrick J. Wong
@ 2026-06-02 4:58 ` Darrick J. Wong
2026-06-02 6:31 ` Christoph Hellwig
2026-06-02 4:58 ` [PATCH 3/7] xfs_healer: initialize variable here Darrick J. Wong
` (4 subsequent siblings)
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:58 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
If the kernel thinks the repair succeeded but the re-scrub cannot
complete the cross-referencing checks, we should recommend an offline
repair instead of reporting the repair as completely successful or
unnecessary. Codex noticed the omitted flag check here.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: 6f0f35a87c8797 ("xfs_healer: enable repairing filesystems")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/fsrepair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 002e5e78fcf22e..3f9bd39de9cec6 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -23,7 +23,7 @@ static enum repair_outcome from_repair_oflags(uint32_t oflags)
if (oflags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_INCOMPLETE))
return REPAIR_FAILED;
- if (oflags & XFS_SCRUB_OFLAG_XFAIL)
+ if (oflags & (XFS_SCRUB_OFLAG_XFAIL | XFS_SCRUB_OFLAG_XCORRUPT))
return REPAIR_PROBABLY_OK;
if (oflags & XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 3/7] xfs_healer: initialize variable here
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
2026-06-02 4:58 ` [PATCH 1/7] xfs_healer: fix error reporting Darrick J. Wong
2026-06-02 4:58 ` [PATCH 2/7] xfs_healer: recommend offline fsck for XCORRUPT repairs Darrick J. Wong
@ 2026-06-02 4:58 ` Darrick J. Wong
2026-06-02 6:32 ` Christoph Hellwig
2026-06-02 4:59 ` [PATCH 4/7] xfs_healer: run a full xfs_scrub repair if we don't know how to do a spot repair Darrick J. Wong
` (3 subsequent siblings)
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:58 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Initialize the event prefix to zeroes so that we don't read bogus stack
data if the filesystem doesn't support parent pointers. Found by Codex.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: 17d840245c0e20 ("xfs_healer: use getparents to look up file names")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/fsrepair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 3f9bd39de9cec6..7ad513b2d69b93 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -203,7 +203,7 @@ try_repair_inode(
{0, 0},
};
#undef X
- struct hme_prefix new_pfx;
+ struct hme_prefix new_pfx = { };
const struct hme_prefix *pfx = orig_pfx;
const struct u32_scrub *f;
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 4/7] xfs_healer: run a full xfs_scrub repair if we don't know how to do a spot repair
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
` (2 preceding siblings ...)
2026-06-02 4:58 ` [PATCH 3/7] xfs_healer: initialize variable here Darrick J. Wong
@ 2026-06-02 4:59 ` Darrick J. Wong
2026-06-02 6:32 ` Christoph Hellwig
2026-06-02 4:59 ` [PATCH 5/7] xfs_healer: fix Makefile errors Darrick J. Wong
` (2 subsequent siblings)
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:59 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Codex noticed that there are a few sickness event flags that we don't
explicitly handle in fsrepair.c. These three codes
(XFS_FSOP_GEOM_SICK_METADIR, XFS_FSOP_GEOM_SICK_METAPATH, and
XFS_AG_GEOM_SICK_INODES) can't be handled from within xfs_healer, so we
should trigger a full xfs_scrub run to try to fix those problems.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: a162dc462186a6 ("xfs_healer: run full scrub after lost corruption events or targeted repair failure")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/fsrepair.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/healer/fsrepair.c b/healer/fsrepair.c
index 7ad513b2d69b93..7da9e562d68734 100644
--- a/healer/fsrepair.c
+++ b/healer/fsrepair.c
@@ -88,6 +88,7 @@ try_repair_wholefs(
};
#undef X
const struct u32_scrub *f;
+ uint32_t needrepair = hme->e.fs.mask;
foreach_scrub_type(f, hme->e.fs.mask, FS_STRUCTURES) {
enum repair_outcome outcome =
@@ -99,7 +100,11 @@ try_repair_wholefs(
if (outcome == REPAIR_FAILED)
return NEED_FULL_REPAIR;
+
+ needrepair &= ~f->event_mask;
}
+ if (needrepair)
+ return NEED_FULL_REPAIR;
return REPAIR_DONE;
}
@@ -127,7 +132,8 @@ try_repair_ag(
{0, 0},
};
#undef X
- const struct u32_scrub *f;
+ const struct u32_scrub *f;
+ uint32_t needrepair = hme->e.group.mask;
foreach_scrub_type(f, hme->e.group.mask, AG_STRUCTURES) {
enum repair_outcome outcome =
@@ -140,7 +146,11 @@ try_repair_ag(
if (outcome == REPAIR_FAILED)
return NEED_FULL_REPAIR;
+
+ needrepair &= ~f->event_mask;
}
+ if (needrepair)
+ return NEED_FULL_REPAIR;
return REPAIR_DONE;
}
@@ -163,7 +173,8 @@ try_repair_rtgroup(
{0, 0},
};
#undef X
- const struct u32_scrub *f;
+ const struct u32_scrub *f;
+ uint32_t needrepair = hme->e.group.mask;
foreach_scrub_type(f, hme->e.group.mask, RTG_STRUCTURES) {
enum repair_outcome outcome =
@@ -176,7 +187,11 @@ try_repair_rtgroup(
if (outcome == REPAIR_FAILED)
return NEED_FULL_REPAIR;
+
+ needrepair &= ~f->event_mask;
}
+ if (needrepair)
+ return NEED_FULL_REPAIR;
return REPAIR_DONE;
}
@@ -206,6 +221,7 @@ try_repair_inode(
struct hme_prefix new_pfx = { };
const struct hme_prefix *pfx = orig_pfx;
const struct u32_scrub *f;
+ uint32_t needrepair = hme->e.inode.mask;
foreach_scrub_type(f, hme->e.inode.mask, INODE_STRUCTURES) {
enum repair_outcome outcome =
@@ -228,7 +244,11 @@ try_repair_inode(
if (outcome == REPAIR_FAILED)
return NEED_FULL_REPAIR;
+
+ needrepair &= ~f->event_mask;
}
+ if (needrepair)
+ return NEED_FULL_REPAIR;
return REPAIR_DONE;
}
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 5/7] xfs_healer: fix Makefile errors
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
` (3 preceding siblings ...)
2026-06-02 4:59 ` [PATCH 4/7] xfs_healer: run a full xfs_scrub repair if we don't know how to do a spot repair Darrick J. Wong
@ 2026-06-02 4:59 ` Darrick J. Wong
2026-06-02 6:32 ` Christoph Hellwig
2026-06-02 4:59 ` [PATCH 6/7] xfs_healer: don't leak wh->mntpoint if fd_to_handle fails Darrick J. Wong
2026-06-02 4:59 ` [PATCH 7/7] xfs_healer: coordinate access to weakhandle::mntpoint correctly Darrick J. Wong
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:59 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Codex had a couple of complaints about the makefile -- mostly that it
never cleans up after xfs_healer_start; and that it should use LTINSTALL
to install libtool binaries.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/Makefile | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/healer/Makefile b/healer/Makefile
index 1db2872617241b..49a48b52782e72 100644
--- a/healer/Makefile
+++ b/healer/Makefile
@@ -42,9 +42,12 @@ endif # HAVE_SYSTEMD
ifeq ($(HAVE_HEALER_START_DEPS),yes)
BUILD_TARGETS += xfs_healer_start
+LDIRT += xfs_healer_start xfs_healer_start.o
SYSTEMD_SERVICES += xfs_healer_start.service
endif # xfs_healer_start deps
+LDIRT += $(SYSTEMD_SERVICES)
+
default: depend $(BUILD_TARGETS) $(SYSTEMD_SERVICES)
xfs_healer_start: $(SUBDIRS) xfs_healer_start.o $(LTDEPENDENCIES)
@@ -62,7 +65,7 @@ install: $(INSTALL_HEALER)
install-healer: default
$(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
- $(INSTALL) -m 755 $(BUILD_TARGETS) $(PKG_LIBEXEC_DIR)
+ $(LTINSTALL) -m 755 $(BUILD_TARGETS) $(PKG_LIBEXEC_DIR)
install-systemd: default
$(INSTALL) -m 755 -d $(SYSTEMD_SYSTEM_UNIT_DIR)
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 6/7] xfs_healer: don't leak wh->mntpoint if fd_to_handle fails
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
` (4 preceding siblings ...)
2026-06-02 4:59 ` [PATCH 5/7] xfs_healer: fix Makefile errors Darrick J. Wong
@ 2026-06-02 4:59 ` Darrick J. Wong
2026-06-02 6:33 ` Christoph Hellwig
2026-06-02 4:59 ` [PATCH 7/7] xfs_healer: coordinate access to weakhandle::mntpoint correctly Darrick J. Wong
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:59 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Codex noticed that we leak wh->mntpoint if fd_to_handle fails, so fix
that leak.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: 6f0f35a87c8797 ("xfs_healer: enable repairing filesystems")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/weakhandle.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 506689aa77637e..6c0dbb301c4f7d 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -63,11 +63,13 @@ weakhandle_alloc(
ret = fd_to_handle(fd, &wh->hanp, &wh->hlen);
if (ret)
- goto out_wh;
+ goto out_mntpt;
*whp = wh;
return 0;
+out_mntpt:
+ free(wh->mntpoint);
out_wh:
free(wh);
return -1;
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH 7/7] xfs_healer: coordinate access to weakhandle::mntpoint correctly
2026-06-02 4:58 [PATCHSET] xfs_healer: codex-inspired bug fixes for 7.1 Darrick J. Wong
` (5 preceding siblings ...)
2026-06-02 4:59 ` [PATCH 6/7] xfs_healer: don't leak wh->mntpoint if fd_to_handle fails Darrick J. Wong
@ 2026-06-02 4:59 ` Darrick J. Wong
2026-06-02 6:34 ` Christoph Hellwig
6 siblings, 1 reply; 15+ messages in thread
From: Darrick J. Wong @ 2026-06-02 4:59 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, linux-xfs, hch
From: Darrick J. Wong <djwong@kernel.org>
Codex complained about a potential UAF in the weakhandle code if one
thread is walking weakhandle::mntpoint at exactly the same instant that
another thread calls try_update_mntpoint to update the mntpoint pointer
and free the old mntpoint string. This can only happen if the sysadmin
decides to move the mountpoint while the kernel is logging runtime
errors and healer is trying to fix them, but it /is/ a UAF.
Fix this by transforming the existing static lock into a rwlock and
hoisting it to struct weakhandle. Then make sure that we take it for
all accesses to the mntpoint string.
Cc: <linux-xfs@vger.kernel.org> # v7.0.0
Fixes: 1a5eeddf738c80 ("xfs_healer: update weakhandle mountpoint when reconnecting")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
healer/weakhandle.c | 61 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 43 insertions(+), 18 deletions(-)
diff --git a/healer/weakhandle.c b/healer/weakhandle.c
index 6c0dbb301c4f7d..9aa8859881481b 100644
--- a/healer/weakhandle.c
+++ b/healer/weakhandle.c
@@ -18,6 +18,9 @@
#include "xfs_healer.h"
struct weakhandle {
+ /* Synchronizes accesses to mntpoint */
+ pthread_rwlock_t lock;
+
/* Owned reference to the user's mountpoint for logging */
char *mntpoint;
@@ -61,13 +64,21 @@ weakhandle_alloc(
wh->mnt_id = mnt_id;
wh->fsname = fsname;
- ret = fd_to_handle(fd, &wh->hanp, &wh->hlen);
- if (ret)
+ ret = pthread_rwlock_init(&wh->lock, NULL);
+ if (ret) {
+ errno = ret;
goto out_mntpt;
+ }
+
+ ret = fd_to_handle(fd, &wh->hanp, &wh->hlen);
+ if (ret)
+ goto out_rwlock;
*whp = wh;
return 0;
+out_rwlock:
+ pthread_rwlock_destroy(&wh->lock);
out_mntpt:
free(wh->mntpoint);
out_wh:
@@ -80,19 +91,18 @@ try_update_mntpoint(
struct weakhandle *wh,
const char *path)
{
- static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
char *s = strdup(path);
if (!s)
return;
- pthread_mutex_lock(&lock);
+ pthread_rwlock_wrlock(&wh->lock);
if (path != wh->mntpoint) {
free(wh->mntpoint);
wh->mntpoint = s;
s = NULL;
}
- pthread_mutex_unlock(&lock);
+ pthread_rwlock_unlock(&wh->lock);
free(s);
}
@@ -134,9 +144,6 @@ weakhandle_reopen_from(
goto out_handle;
}
- if (path != wh->mntpoint)
- try_update_mntpoint(wh, path);
-
free_handle(hanp, hlen);
*fd = mnt_fd;
return 0;
@@ -164,7 +171,9 @@ weakhandle_reopen(
int ret;
/* First try reopening using the original mountpoint */
+ pthread_rwlock_rdlock(&wh->lock);
ret = weakhandle_reopen_from(wh, wh->mntpoint, fd, is_acceptable, data);
+ pthread_rwlock_unlock(&wh->lock);
if (!ret)
return 0;
@@ -180,8 +189,10 @@ weakhandle_reopen(
goto fallback;
ret = weakhandle_reopen_from(wh, smbuf->str + smbuf->mnt_point, fd,
is_acceptable, data);
- if (!ret)
+ if (!ret) {
+ try_update_mntpoint(wh, smbuf->str + smbuf->mnt_point);
return 0;
+ }
fallback:
/*
@@ -200,8 +211,10 @@ weakhandle_reopen(
ret = weakhandle_reopen_from(wh, mnt->mnt_dir, fd,
is_acceptable, data);
- if (!ret)
+ if (!ret) {
+ try_update_mntpoint(wh, mnt->mnt_dir);
break;
+ }
}
if (*fd < 0) {
@@ -221,6 +234,7 @@ weakhandle_free(
struct weakhandle *wh = *whp;
if (wh) {
+ pthread_rwlock_destroy(&wh->lock);
free_handle(wh->hanp, wh->hlen);
free(wh->mntpoint);
free(wh);
@@ -230,25 +244,30 @@ weakhandle_free(
}
struct bufvec {
- char *buf;
- size_t len;
+ struct weakhandle *wh;
+ char *buf;
+ size_t len;
};
static int
render_path(
- const char *mntpt,
+ const char *donotuse,
const struct path_list *path,
void *arg)
{
struct bufvec *args = arg;
- int mntpt_len = strlen(mntpt);
+ struct weakhandle *wh = args->wh;
+ int mntpt_len;
ssize_t ret;
/* Trim trailing slashes from the mountpoint */
- while (mntpt_len > 0 && mntpt[mntpt_len - 1] == '/')
+ pthread_rwlock_rdlock(&wh->lock);
+ mntpt_len = strlen(wh->mntpoint);
+ while (mntpt_len > 0 && wh->mntpoint[mntpt_len - 1] == '/')
mntpt_len--;
- ret = snprintf(args->buf, args->len, "%.*s", mntpt_len, mntpt);
+ ret = snprintf(args->buf, args->len, "%.*s", mntpt_len, wh->mntpoint);
+ pthread_rwlock_unlock(&wh->lock);
if (ret < 0 || ret >= args->len)
return 0;
@@ -271,6 +290,7 @@ weakhandle_getpath_for(
{
struct xfs_handle fakehandle;
struct bufvec bv = {
+ .wh = wh,
.buf = path,
.len = pathlen,
};
@@ -295,7 +315,7 @@ weakhandle_getpath_for(
* path that goes to the rootdir? With a max filename length of 255
* bytes, we pick 600 for the buffer size.
*/
- ret = handle_walk_paths_fd(wh->mntpoint, mnt_fd, &fakehandle,
+ ret = handle_walk_paths_fd(NULL, mnt_fd, &fakehandle,
sizeof(fakehandle), 600, render_path, &bv);
switch (ret) {
case ECANCELED:
@@ -321,6 +341,11 @@ weakhandle_instance_unit_name(
char *unitname,
size_t unitnamelen)
{
- return systemd_path_instance_unit_name(template, wh->mntpoint,
+ int ret;
+
+ pthread_rwlock_rdlock(&wh->lock);
+ ret = systemd_path_instance_unit_name(template, wh->mntpoint,
unitname, unitnamelen);
+ pthread_rwlock_unlock(&wh->lock);
+ return ret;
}
^ permalink raw reply related [flat|nested] 15+ messages in thread