* [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
@ 2026-02-08 13:18 Ionut Nechita (Wind River)
2026-02-09 23:03 ` Viacheslav Dubeyko
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-08 13:18 UTC (permalink / raw)
To: Ilya Dryomov, Alex Markuze, Viacheslav Dubeyko
Cc: Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
ceph-devel, linux-kernel, linux-rt-devel, Ionut Nechita,
Ionut Nechita, Xiubo Li, Jeff Layton, superm1, jkosina
From: Ionut Nechita <ionut.nechita@windriver.com>
When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_mdsc_sync(). The hung_task detector fires
repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
INFO: task sync:12345 blocked for more than 122 seconds.
Call Trace:
ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
ceph_sync_fs+0x31/0x130 [ceph]
iterate_supers+0x97/0x100
ksys_sync+0x32/0xb0
Three functions in the MDS sync path use indefinite waits:
1. wait_caps_flush() uses wait_event() with no timeout
2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
wait_for_completion() with no timeout
3. ceph_mdsc_sync() returns void, cannot propagate errors
This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary MDS unavailability.
Fix this by adding mount_timeout-based timeouts (default 60s) to the
blocking waits, following the existing pattern used by wait_requests()
and ceph_mdsc_close_sessions() in the same file:
- wait_caps_flush(): use wait_event_timeout() with mount_timeout
- flush_mdlog_and_wait_mdsc_unsafe_requests(): use
wait_for_completion_timeout() with mount_timeout
- ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
- ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
On timeout, dirty caps and pending requests are NOT discarded - they
remain in memory and are re-synced when MDS reconnects. The timeout
simply unblocks the calling task. If mount_timeout is set to 0,
ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
original infinite-wait behavior.
Real-world impact: In production logs showing 'task sync blocked for
more than 983 seconds', this patch limits the block to mount_timeout
(60s default), returning -ETIMEDOUT to the VFS layer instead of
hanging indefinitely.
Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
fs/ceph/mds_client.h | 2 +-
fs/ceph/super.c | 5 +++--
3 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7e4eab824daef..4cd8f584147f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
*
* returns true if we've flushed through want_flush_tid
*/
-static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid)
+static int wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ long ret;
doutc(cl, "want %llu\n", want_flush_tid);
- wait_event(mdsc->cap_flushing_wq,
- check_caps_flush(mdsc, want_flush_tid));
+ ret = wait_event_timeout(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid),
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!ret) {
+ pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
+ want_flush_tid);
+ return -ETIMEDOUT;
+ }
doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
+ return 0;
}
/*
@@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
/*
* flush the mdlog and wait for all write mds requests to flush.
*/
-static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
- u64 want_tid)
+static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
+ unsigned long left;
mutex_lock(&mdsc->mutex);
doutc(cl, "want %lld\n", want_tid);
@@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
}
doutc(cl, "wait on %llu (want %llu)\n",
req->r_tid, want_tid);
- wait_for_completion(&req->r_safe_completion);
+ left = wait_for_completion_timeout(
+ &req->r_safe_completion,
+ ceph_timeout_jiffies(opts->mount_timeout));
+ if (!left) {
+ pr_warn_client(cl,
+ "flush mdlog request tid %llu timed out\n",
+ req->r_tid);
+ ceph_mdsc_put_request(req);
+ if (nextreq)
+ ceph_mdsc_put_request(nextreq);
+ ceph_put_mds_session(last_session);
+ return -ETIMEDOUT;
+ }
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
@@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
doutc(cl, "done\n");
+ return 0;
}
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
+ int ret;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
- return;
+ return -EIO;
doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
@@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
- flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush);
+ ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+ if (ret)
+ return ret;
+
+ return wait_caps_flush(mdsc, want_flush);
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0428a5eaf28c6..a8b72cb13de1f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c1c1dac320da..6b0ad7a455815 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
{
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ int ret;
if (!wait) {
doutc(cl, "(non-blocking)\n");
@@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
- ceph_mdsc_sync(fsc->mdsc);
+ ret = ceph_mdsc_sync(fsc->mdsc);
doutc(cl, "(blocking) done\n");
- return 0;
+ return ret;
}
/*
--
2.52.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
@ 2026-02-09 23:03 ` Viacheslav Dubeyko
2026-02-11 7:21 ` Sebastian Andrzej Siewior
2026-02-13 7:51 ` Ionut Nechita (Wind River)
2 siblings, 0 replies; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-09 23:03 UTC (permalink / raw)
To: idryomov@gmail.com, Alex Markuze, slava@dubeyko.com,
ionut.nechita@windriver.com
Cc: ionut_n2001@yahoo.com, Xiubo Li, linux-rt-devel@lists.linux.dev,
ceph-devel@vger.kernel.org, rostedt@goodmis.org,
linux-kernel@vger.kernel.org, bigeasy@linutronix.de,
clrkwllms@kernel.org, superm1@kernel.org, jlayton@kernel.org,
jkosina@suse.com
On Sun, 2026-02-08 at 15:18 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
>
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
Do you have any reproduction path of this? Which particular use-case can trigger
the issue with higher probability?
Maybe, do we need to find the real reason of the issue? It sounds to me like a
workaround for some real issue(s).
>
> INFO: task sync:12345 blocked for more than 122 seconds.
> Call Trace:
> ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
> ceph_sync_fs+0x31/0x130 [ceph]
> iterate_supers+0x97/0x100
> ksys_sync+0x32/0xb0
>
> Three functions in the MDS sync path use indefinite waits:
>
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
> wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
>
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.
If it is temporary MDS unavailability, then, I assume that libceph should manage
this situation. Am I wrong here? I expect that libceph should wake up the
waiting flushing threads. Do I oversimplify the whole workflow? :)
>
> Fix this by adding mount_timeout-based timeouts (default 60s) to the
> blocking waits, following the existing pattern used by wait_requests()
> and ceph_mdsc_close_sessions() in the same file:
Maybe, you are right that we need to use the timeout based approach. But what
was the reason that flushing threads haven't been woken up?
Thanks,
Slava.
>
> - wait_caps_flush(): use wait_event_timeout() with mount_timeout
> - flush_mdlog_and_wait_mdsc_unsafe_requests(): use
> wait_for_completion_timeout() with mount_timeout
> - ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
> - ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
>
> On timeout, dirty caps and pending requests are NOT discarded - they
> remain in memory and are re-synced when MDS reconnects. The timeout
> simply unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.
>
> Real-world impact: In production logs showing 'task sync blocked for
> more than 983 seconds', this patch limits the block to mount_timeout
> (60s default), returning -ETIMEDOUT to the VFS layer instead of
> hanging indefinitely.
>
> Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
> fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
> fs/ceph/mds_client.h | 2 +-
> fs/ceph/super.c | 5 +++--
> 3 files changed, 43 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 7e4eab824daef..4cd8f584147f4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
> *
> * returns true if we've flushed through want_flush_tid
> */
> -static void wait_caps_flush(struct ceph_mds_client *mdsc,
> - u64 want_flush_tid)
> +static int wait_caps_flush(struct ceph_mds_client *mdsc,
> + u64 want_flush_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> + long ret;
>
> doutc(cl, "want %llu\n", want_flush_tid);
>
> - wait_event(mdsc->cap_flushing_wq,
> - check_caps_flush(mdsc, want_flush_tid));
> + ret = wait_event_timeout(mdsc->cap_flushing_wq,
> + check_caps_flush(mdsc, want_flush_tid),
> + ceph_timeout_jiffies(opts->mount_timeout));
> + if (!ret) {
> + pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
> + want_flush_tid);
> + return -ETIMEDOUT;
> + }
>
> doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
> + return 0;
> }
>
> /*
> @@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> /*
> * flush the mdlog and wait for all write mds requests to flush.
> */
> -static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> - u64 want_tid)
> +static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> + u64 want_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> struct ceph_mds_request *req = NULL, *nextreq;
> struct ceph_mds_session *last_session = NULL;
> struct rb_node *n;
> + unsigned long left;
>
> mutex_lock(&mdsc->mutex);
> doutc(cl, "want %lld\n", want_tid);
> @@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> }
> doutc(cl, "wait on %llu (want %llu)\n",
> req->r_tid, want_tid);
> - wait_for_completion(&req->r_safe_completion);
> + left = wait_for_completion_timeout(
> + &req->r_safe_completion,
> + ceph_timeout_jiffies(opts->mount_timeout));
> + if (!left) {
> + pr_warn_client(cl,
> + "flush mdlog request tid %llu timed out\n",
> + req->r_tid);
> + ceph_mdsc_put_request(req);
> + if (nextreq)
> + ceph_mdsc_put_request(nextreq);
> + ceph_put_mds_session(last_session);
> + return -ETIMEDOUT;
> + }
>
> mutex_lock(&mdsc->mutex);
> ceph_mdsc_put_request(req);
> @@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> mutex_unlock(&mdsc->mutex);
> ceph_put_mds_session(last_session);
> doutc(cl, "done\n");
> + return 0;
> }
>
> -void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> +int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> u64 want_tid, want_flush;
> + int ret;
>
> if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
> - return;
> + return -EIO;
>
> doutc(cl, "sync\n");
> mutex_lock(&mdsc->mutex);
> @@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
>
> doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
>
> - flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> - wait_caps_flush(mdsc, want_flush);
> + ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> + if (ret)
> + return ret;
> +
> + return wait_caps_flush(mdsc, want_flush);
> }
>
> /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0428a5eaf28c6..a8b72cb13de1f 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
>
> -extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
> +extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
>
> extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
> extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 7c1c1dac320da..6b0ad7a455815 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
> {
> struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
> struct ceph_client *cl = fsc->client;
> + int ret;
>
> if (!wait) {
> doutc(cl, "(non-blocking)\n");
> @@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>
> doutc(cl, "(blocking)\n");
> ceph_osdc_sync(&fsc->client->osdc);
> - ceph_mdsc_sync(fsc->mdsc);
> + ret = ceph_mdsc_sync(fsc->mdsc);
> doutc(cl, "(blocking) done\n");
> - return 0;
> + return ret;
> }
>
> /*
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
2026-02-09 23:03 ` Viacheslav Dubeyko
@ 2026-02-11 7:21 ` Sebastian Andrzej Siewior
2026-02-13 7:51 ` Ionut Nechita (Wind River)
2 siblings, 0 replies; 8+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-02-11 7:21 UTC (permalink / raw)
To: Ionut Nechita (Wind River)
Cc: Ilya Dryomov, Alex Markuze, Viacheslav Dubeyko, Clark Williams,
Steven Rostedt, ceph-devel, linux-kernel, linux-rt-devel,
Ionut Nechita, Xiubo Li, Jeff Layton, superm1, jkosina
On 2026-02-08 15:18:20 [+0200], Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
>
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
>
> INFO: task sync:12345 blocked for more than 122 seconds.
> Call Trace:
> ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
> ceph_sync_fs+0x31/0x130 [ceph]
> iterate_supers+0x97/0x100
> ksys_sync+0x32/0xb0
>
> Three functions in the MDS sync path use indefinite waits:
>
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
> wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
>
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.
I may have misunderstood this but how is this different from a
!PREEMPT_RT kernel? As far as I understand, there should be no
difference in how both kernels react to the situation.
Could you check with lockdep and might_sleep if there a locking problem
and some kind of state is lost or wrongly interpreted?
Sebastian
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
2026-02-09 23:03 ` Viacheslav Dubeyko
2026-02-11 7:21 ` Sebastian Andrzej Siewior
@ 2026-02-13 7:51 ` Ionut Nechita (Wind River)
2026-02-17 21:52 ` Viacheslav Dubeyko
2 siblings, 1 reply; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-13 7:51 UTC (permalink / raw)
To: idryomov
Cc: amarkuze, bigeasy, ceph-devel, clrkwllms, ionut_n2001, jkosina,
jlayton, linux-kernel, linux-rt-devel, rostedt, sage, slava,
superm1, xiubli, Ionut Nechita
I also created a tracker issue for this on the Ceph bug tracker:
https://tracker.ceph.com/issues/74897
Thanks,
Ionut
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-13 7:51 ` Ionut Nechita (Wind River)
@ 2026-02-17 21:52 ` Viacheslav Dubeyko
2026-02-18 19:57 ` Ionut Nechita (Wind River)
0 siblings, 1 reply; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-17 21:52 UTC (permalink / raw)
To: idryomov@gmail.com, ionut.nechita@windriver.com
Cc: ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
linux-rt-devel@lists.linux.dev, jkosina@suse.com,
ceph-devel@vger.kernel.org, rostedt@goodmis.org,
linux-kernel@vger.kernel.org, slava@dubeyko.com, Alex Markuze,
jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
superm1@kernel.org
On Fri, 2026-02-13 at 09:51 +0200, Ionut Nechita (Wind River) wrote:
> I also created a tracker issue for this on the Ceph bug tracker:
>
> https://tracker.ceph.com/issues/74897
>
It looks like that I was able to reproduce the symptoms of the issue by multiple
runs of generic/013 xfstests' test-case:
#!/bin/bash
while true; do
sudo ./check generic/013
done
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.346895] INFO: task fsstress:14466
blocked for more than 122 seconds.
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.347995] Not tainted 6.19.0-rc8+
#10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.348530] "echo 0 >
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349426] task:fsstress state:D
stack:0 pid:14466 tgid:14466 ppid:14464 task
_flags:0x400140 flags:0x00080800
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349438] Call Trace:
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349441] <TASK>
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349445] __schedule+0xe8a/0x57f0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349457] ? kasan_save_stack+0x39/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349466] ? kasan_save_stack+0x26/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349471] ? kasan_save_track+0x14/0x40
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349475] ?
kasan_save_free_info+0x3b/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349485] ? __kasan_slab_free+0x7a/0xb0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349489] ?
ceph_mdsc_release_request+0x6a3/0x880
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349497] ?
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349502] ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349507] ?
__pv_queued_spin_lock_slowpath+0xb04/0xf80
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349514] ? __pfx___schedule+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349520] ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349525] ?
__call_rcu_common+0x386/0x14b0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349532] schedule+0x75/0x2f0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349538] schedule_timeout+0x16d/0x210
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349542] ?
__pfx_schedule_timeout+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349548] ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349553] ?
_raw_spin_lock_irq+0x8b/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349559] ?
__pfx__raw_spin_lock_irq+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349565] ? kasan_save_track+0x14/0x40
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349569]
wait_for_completion+0x14a/0x340
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349573] ?
__pfx_wait_for_completion+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349577] ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349582] ? __pfx_mutex_unlock+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349587] ceph_mdsc_sync+0x4b4/0xe80
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349593] ?
__pfx_ceph_mdsc_sync+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349597] ?
ceph_osdc_put_request+0x38/0x770
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349603] ? ceph_osdc_sync+0x1cb/0x350
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349608] ceph_sync_fs+0xa0/0x4c0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349612] sync_filesystem+0x182/0x240
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349618] __x64_sys_syncfs+0xac/0x160
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349623] x64_sys_call+0x746/0x2360
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349629] do_syscall_64+0x82/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349635] ? __x64_sys_openat+0x108/0x240
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349641] ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349647] ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349655] ?
__pfx___x64_sys_openat+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349661] ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349667] ? ksys_write+0x1a3/0x230
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349672] ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349677] ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349682] ? do_syscall_64+0xbf/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349687] ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349692] ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349705] ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349709] ? do_syscall_64+0xbf/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349715] ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349720] ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349724] ? irqentry_exit+0xa5/0x600
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349730] ? exc_page_fault+0x95/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349736]
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349740] RIP: 0033:0x792fb1d1ba4b
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349745] RSP: 002b:00007ffc3844eb58
EFLAGS: 00000246 ORIG_RAX: 0000000000000132
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349752] RAX: ffffffffffffffda RBX:
0000000000000000 RCX: 0000792fb1d1ba4b
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349756] RDX: 0000000000000000 RSI:
000059045610b440 RDI: 0000000000000004
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349759] RBP: 0000000000000004 R08:
0000000000000026 R09: 00007ffc3844e986
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349762] R10: 0000000000000000 R11:
0000000000000246 R12: 0000000000000149
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349765] R13: 00007ffc3844eba0 R14:
000059042de9d0b3 R15: 0000000000000149
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349771] </TASK>
l *ceph_mdsc_sync+0x4b4
0xffffffff82cddbe4 is in ceph_mdsc_sync (fs/ceph/mds_client.c:5916).
5911 }
5912 doutc(cl, "wait on %llu (want %llu)\n",
5913 req->r_tid, want_tid);
5914 wait_for_completion(&req->r_safe_completion);
5915
5916 mutex_lock(&mdsc->mutex);
5917 ceph_mdsc_put_request(req);
5918 if (!nextreq)
5919 break; /* next dne before, so we're done! */
5920 if (RB_EMPTY_NODE(&nextreq->r_node)) {
I am not sure yet that reason is the same.
Thanks,
Slava.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-17 21:52 ` Viacheslav Dubeyko
@ 2026-02-18 19:57 ` Ionut Nechita (Wind River)
2026-02-18 20:04 ` Viacheslav Dubeyko
0 siblings, 1 reply; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-18 19:57 UTC (permalink / raw)
To: Viacheslav Dubeyko, Ilya Dryomov
Cc: ceph-devel, linux-kernel, linux-rt-devel, Alex Markuze, Xiubo Li,
sage, jlayton, rostedt, bigeasy, clrkwllms, superm1, jkosina,
ionut_n2001
Hi Slava,
Thanks for testing and reproducing this with generic/013.
Looking at the stack trace you shared:
ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
ceph_sync_fs
sync_filesystem
__x64_sys_syncfs
This is the same pattern we see in the original report - the sync path
blocks indefinitely on wait_for_completion() with no timeout. In your
case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
exactly what patch 2/3 ("ceph: add timeout protection to
ceph_mdsc_sync() path") addresses.
The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
but the symptom and the fix are the same - these wait_for_completion()
calls in the sync path need timeout protection regardless of what causes
the underlying delay.
All three patches are now also on LKML:
1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
3/3 - ceph: add timeout protection to ceph_osdc_sync() path
I've also added more details and debug information to the Ceph tracker
issue at https://tracker.ceph.com/issues/74897 - it might help with
your investigation.
Thanks,
Ionut
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-18 19:57 ` Ionut Nechita (Wind River)
@ 2026-02-18 20:04 ` Viacheslav Dubeyko
2026-02-19 9:37 ` Alex Markuze
0 siblings, 1 reply; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-18 20:04 UTC (permalink / raw)
To: idryomov@gmail.com, ionut.nechita@windriver.com
Cc: ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
linux-rt-devel@lists.linux.dev, ceph-devel@vger.kernel.org,
rostedt@goodmis.org, linux-kernel@vger.kernel.org, Alex Markuze,
jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
superm1@kernel.org, jkosina@suse.com
On Wed, 2026-02-18 at 21:57 +0200, Ionut Nechita (Wind River) wrote:
> Hi Slava,
>
> Thanks for testing and reproducing this with generic/013.
>
> Looking at the stack trace you shared:
>
> ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
> ceph_sync_fs
> sync_filesystem
> __x64_sys_syncfs
>
> This is the same pattern we see in the original report - the sync path
> blocks indefinitely on wait_for_completion() with no timeout. In your
> case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
> exactly what patch 2/3 ("ceph: add timeout protection to
> ceph_mdsc_sync() path") addresses.
>
> The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
> but the symptom and the fix are the same - these wait_for_completion()
> calls in the sync path need timeout protection regardless of what causes
> the underlying delay.
>
> All three patches are now also on LKML:
>
> 1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
> 2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
> 3/3 - ceph: add timeout protection to ceph_osdc_sync() path
>
> I've also added more details and debug information to the Ceph tracker
> issue at https://tracker.ceph.com/issues/74897 - it might help with
> your investigation.
Frankly speaking, I don't see the situation of blocked thread if I am adding
debug output. It looks like a race condition. And I am not sure now that adding
timeout is the proper fix. Probably, we have some issue that needs to be fixed
and timeout looks like workaround but not the fix. I don't think that I have the
IPv6/EADDRNOTAVAIL case on my side.
Thanks,
Slava.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
2026-02-18 20:04 ` Viacheslav Dubeyko
@ 2026-02-19 9:37 ` Alex Markuze
0 siblings, 0 replies; 8+ messages in thread
From: Alex Markuze @ 2026-02-19 9:37 UTC (permalink / raw)
To: Viacheslav Dubeyko
Cc: idryomov@gmail.com, ionut.nechita@windriver.com,
ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
linux-rt-devel@lists.linux.dev, ceph-devel@vger.kernel.org,
rostedt@goodmis.org, linux-kernel@vger.kernel.org,
jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
superm1@kernel.org, jkosina@suse.com
I tend to agree here with Slava, I don't support any Timeout as a
solution before we have an actual RCA.
On Wed, Feb 18, 2026 at 10:04 PM Viacheslav Dubeyko
<Slava.Dubeyko@ibm.com> wrote:
>
> On Wed, 2026-02-18 at 21:57 +0200, Ionut Nechita (Wind River) wrote:
> > Hi Slava,
> >
> > Thanks for testing and reproducing this with generic/013.
> >
> > Looking at the stack trace you shared:
> >
> > ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
> > ceph_sync_fs
> > sync_filesystem
> > __x64_sys_syncfs
> >
> > This is the same pattern we see in the original report - the sync path
> > blocks indefinitely on wait_for_completion() with no timeout. In your
> > case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
> > exactly what patch 2/3 ("ceph: add timeout protection to
> > ceph_mdsc_sync() path") addresses.
> >
> > The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
> > but the symptom and the fix are the same - these wait_for_completion()
> > calls in the sync path need timeout protection regardless of what causes
> > the underlying delay.
> >
> > All three patches are now also on LKML:
> >
> > 1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
> > 2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
> > 3/3 - ceph: add timeout protection to ceph_osdc_sync() path
> >
> > I've also added more details and debug information to the Ceph tracker
> > issue at https://tracker.ceph.com/issues/74897 - it might help with
> > your investigation.
>
> Frankly speaking, I don't see the situation of blocked thread if I am adding
> debug output. It looks like a race condition. And I am not sure now that adding
> timeout is the proper fix. Probably, we have some issue that needs to be fixed
> and timeout looks like workaround but not the fix. I don't think that I have the
> IPv6/EADDRNOTAVAIL case on my side.
>
> Thanks,
> Slava.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2026-02-19 9:37 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
2026-02-09 23:03 ` Viacheslav Dubeyko
2026-02-11 7:21 ` Sebastian Andrzej Siewior
2026-02-13 7:51 ` Ionut Nechita (Wind River)
2026-02-17 21:52 ` Viacheslav Dubeyko
2026-02-18 19:57 ` Ionut Nechita (Wind River)
2026-02-18 20:04 ` Viacheslav Dubeyko
2026-02-19 9:37 ` Alex Markuze
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox