Linux real-time development
 help / color / mirror / Atom feed
* [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
@ 2026-02-08 13:18 Ionut Nechita (Wind River)
  2026-02-09 23:03 ` Viacheslav Dubeyko
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-08 13:18 UTC (permalink / raw)
  To: Ilya Dryomov, Alex Markuze, Viacheslav Dubeyko
  Cc: Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	ceph-devel, linux-kernel, linux-rt-devel, Ionut Nechita,
	Ionut Nechita, Xiubo Li, Jeff Layton, superm1, jkosina

From: Ionut Nechita <ionut.nechita@windriver.com>

When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
during DAD or network transitions), the sync syscall can block
indefinitely in ceph_mdsc_sync(). The hung_task detector fires
repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:

  INFO: task sync:12345 blocked for more than 122 seconds.
  Call Trace:
    ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
    ceph_sync_fs+0x31/0x130 [ceph]
    iterate_supers+0x97/0x100
    ksys_sync+0x32/0xb0

Three functions in the MDS sync path use indefinite waits:

1. wait_caps_flush() uses wait_event() with no timeout
2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
   wait_for_completion() with no timeout
3. ceph_mdsc_sync() returns void, cannot propagate errors

This is particularly problematic in Kubernetes environments with
PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
and IPv6 network reconfigurations cause temporary MDS unavailability.

Fix this by adding mount_timeout-based timeouts (default 60s) to the
blocking waits, following the existing pattern used by wait_requests()
and ceph_mdsc_close_sessions() in the same file:

- wait_caps_flush(): use wait_event_timeout() with mount_timeout
- flush_mdlog_and_wait_mdsc_unsafe_requests(): use
  wait_for_completion_timeout() with mount_timeout
- ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
- ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS

On timeout, dirty caps and pending requests are NOT discarded - they
remain in memory and are re-synced when MDS reconnects. The timeout
simply unblocks the calling task. If mount_timeout is set to 0,
ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
original infinite-wait behavior.

Real-world impact: In production logs showing 'task sync blocked for
more than 983 seconds', this patch limits the block to mount_timeout
(60s default), returning -ETIMEDOUT to the VFS layer instead of
hanging indefinitely.

Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
---
 fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/super.c      |  5 +++--
 3 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7e4eab824daef..4cd8f584147f4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
  *
  * returns true if we've flushed through want_flush_tid
  */
-static void wait_caps_flush(struct ceph_mds_client *mdsc,
-			    u64 want_flush_tid)
+static int wait_caps_flush(struct ceph_mds_client *mdsc,
+			   u64 want_flush_tid)
 {
 	struct ceph_client *cl = mdsc->fsc->client;
+	struct ceph_options *opts = mdsc->fsc->client->options;
+	long ret;
 
 	doutc(cl, "want %llu\n", want_flush_tid);
 
-	wait_event(mdsc->cap_flushing_wq,
-		   check_caps_flush(mdsc, want_flush_tid));
+	ret = wait_event_timeout(mdsc->cap_flushing_wq,
+				 check_caps_flush(mdsc, want_flush_tid),
+				 ceph_timeout_jiffies(opts->mount_timeout));
+	if (!ret) {
+		pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
+			       want_flush_tid);
+		return -ETIMEDOUT;
+	}
 
 	doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
+	return 0;
 }
 
 /*
@@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 /*
  * flush the mdlog and wait for all write mds requests to flush.
  */
-static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
-						 u64 want_tid)
+static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+						      u64 want_tid)
 {
 	struct ceph_client *cl = mdsc->fsc->client;
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_request *req = NULL, *nextreq;
 	struct ceph_mds_session *last_session = NULL;
 	struct rb_node *n;
+	unsigned long left;
 
 	mutex_lock(&mdsc->mutex);
 	doutc(cl, "want %lld\n", want_tid);
@@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
 			}
 			doutc(cl, "wait on %llu (want %llu)\n",
 			      req->r_tid, want_tid);
-			wait_for_completion(&req->r_safe_completion);
+			left = wait_for_completion_timeout(
+					&req->r_safe_completion,
+					ceph_timeout_jiffies(opts->mount_timeout));
+			if (!left) {
+				pr_warn_client(cl,
+					       "flush mdlog request tid %llu timed out\n",
+					       req->r_tid);
+				ceph_mdsc_put_request(req);
+				if (nextreq)
+					ceph_mdsc_put_request(nextreq);
+				ceph_put_mds_session(last_session);
+				return -ETIMEDOUT;
+			}
 
 			mutex_lock(&mdsc->mutex);
 			ceph_mdsc_put_request(req);
@@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
 	mutex_unlock(&mdsc->mutex);
 	ceph_put_mds_session(last_session);
 	doutc(cl, "done\n");
+	return 0;
 }
 
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	struct ceph_client *cl = mdsc->fsc->client;
 	u64 want_tid, want_flush;
+	int ret;
 
 	if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
-		return;
+		return -EIO;
 
 	doutc(cl, "sync\n");
 	mutex_lock(&mdsc->mutex);
@@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 
 	doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
 
-	flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
-	wait_caps_flush(mdsc, want_flush);
+	ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+	if (ret)
+		return ret;
+
+	return wait_caps_flush(mdsc, want_flush);
 }
 
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0428a5eaf28c6..a8b72cb13de1f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
 extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c1c1dac320da..6b0ad7a455815 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
 	struct ceph_client *cl = fsc->client;
+	int ret;
 
 	if (!wait) {
 		doutc(cl, "(non-blocking)\n");
@@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
 
 	doutc(cl, "(blocking)\n");
 	ceph_osdc_sync(&fsc->client->osdc);
-	ceph_mdsc_sync(fsc->mdsc);
+	ret = ceph_mdsc_sync(fsc->mdsc);
 	doutc(cl, "(blocking) done\n");
-	return 0;
+	return ret;
 }
 
 /*
-- 
2.52.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re:  [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
@ 2026-02-09 23:03 ` Viacheslav Dubeyko
  2026-02-11  7:21 ` Sebastian Andrzej Siewior
  2026-02-13  7:51 ` Ionut Nechita (Wind River)
  2 siblings, 0 replies; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-09 23:03 UTC (permalink / raw)
  To: idryomov@gmail.com, Alex Markuze, slava@dubeyko.com,
	ionut.nechita@windriver.com
  Cc: ionut_n2001@yahoo.com, Xiubo Li, linux-rt-devel@lists.linux.dev,
	ceph-devel@vger.kernel.org, rostedt@goodmis.org,
	linux-kernel@vger.kernel.org, bigeasy@linutronix.de,
	clrkwllms@kernel.org, superm1@kernel.org, jlayton@kernel.org,
	jkosina@suse.com

On Sun, 2026-02-08 at 15:18 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
> 
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:

Do you have any reproduction path of this? Which particular use-case can trigger
the issue with higher probability?

Maybe, do we need to find the real reason of the issue? It sounds to me like a
workaround for some real issue(s).

> 
>   INFO: task sync:12345 blocked for more than 122 seconds.
>   Call Trace:
>     ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
>     ceph_sync_fs+0x31/0x130 [ceph]
>     iterate_supers+0x97/0x100
>     ksys_sync+0x32/0xb0
> 
> Three functions in the MDS sync path use indefinite waits:
> 
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
>    wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
> 
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.

If it is temporary MDS unavailability, then, I assume that libceph should manage
this situation. Am I wrong here? I expect that libceph should wake up the
waiting flushing threads. Do I oversimplify the whole workflow? :)

> 
> Fix this by adding mount_timeout-based timeouts (default 60s) to the
> blocking waits, following the existing pattern used by wait_requests()
> and ceph_mdsc_close_sessions() in the same file:


Maybe, you are right that we need to use the timeout based approach. But what
was the reason that flushing threads haven't been woken up?

Thanks,
Slava.

> 
> - wait_caps_flush(): use wait_event_timeout() with mount_timeout
> - flush_mdlog_and_wait_mdsc_unsafe_requests(): use
>   wait_for_completion_timeout() with mount_timeout
> - ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
> - ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
> 
> On timeout, dirty caps and pending requests are NOT discarded - they
> remain in memory and are re-synced when MDS reconnects. The timeout
> simply unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.
> 
> Real-world impact: In production logs showing 'task sync blocked for
> more than 983 seconds', this patch limits the block to mount_timeout
> (60s default), returning -ETIMEDOUT to the VFS layer instead of
> hanging indefinitely.
> 
> Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
> Signed-off-by: Ionut Nechita <ionut.nechita@windriver.com>
> ---
>  fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/super.c      |  5 +++--
>  3 files changed, 43 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 7e4eab824daef..4cd8f584147f4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2290,17 +2290,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
>   *
>   * returns true if we've flushed through want_flush_tid
>   */
> -static void wait_caps_flush(struct ceph_mds_client *mdsc,
> -			    u64 want_flush_tid)
> +static int wait_caps_flush(struct ceph_mds_client *mdsc,
> +			   u64 want_flush_tid)
>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
> +	struct ceph_options *opts = mdsc->fsc->client->options;
> +	long ret;
>  
>  	doutc(cl, "want %llu\n", want_flush_tid);
>  
> -	wait_event(mdsc->cap_flushing_wq,
> -		   check_caps_flush(mdsc, want_flush_tid));
> +	ret = wait_event_timeout(mdsc->cap_flushing_wq,
> +				 check_caps_flush(mdsc, want_flush_tid),
> +				 ceph_timeout_jiffies(opts->mount_timeout));
> +	if (!ret) {
> +		pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
> +			       want_flush_tid);
> +		return -ETIMEDOUT;
> +	}
>  
>  	doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
> +	return 0;
>  }
>  
>  /*
> @@ -5865,13 +5874,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>  /*
>   * flush the mdlog and wait for all write mds requests to flush.
>   */
> -static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> -						 u64 want_tid)
> +static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> +						      u64 want_tid)
>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
> +	struct ceph_options *opts = mdsc->fsc->client->options;
>  	struct ceph_mds_request *req = NULL, *nextreq;
>  	struct ceph_mds_session *last_session = NULL;
>  	struct rb_node *n;
> +	unsigned long left;
>  
>  	mutex_lock(&mdsc->mutex);
>  	doutc(cl, "want %lld\n", want_tid);
> @@ -5910,7 +5921,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
>  			}
>  			doutc(cl, "wait on %llu (want %llu)\n",
>  			      req->r_tid, want_tid);
> -			wait_for_completion(&req->r_safe_completion);
> +			left = wait_for_completion_timeout(
> +					&req->r_safe_completion,
> +					ceph_timeout_jiffies(opts->mount_timeout));
> +			if (!left) {
> +				pr_warn_client(cl,
> +					       "flush mdlog request tid %llu timed out\n",
> +					       req->r_tid);
> +				ceph_mdsc_put_request(req);
> +				if (nextreq)
> +					ceph_mdsc_put_request(nextreq);
> +				ceph_put_mds_session(last_session);
> +				return -ETIMEDOUT;
> +			}
>  
>  			mutex_lock(&mdsc->mutex);
>  			ceph_mdsc_put_request(req);
> @@ -5928,15 +5951,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
>  	mutex_unlock(&mdsc->mutex);
>  	ceph_put_mds_session(last_session);
>  	doutc(cl, "done\n");
> +	return 0;
>  }
>  
> -void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> +int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
>  	u64 want_tid, want_flush;
> +	int ret;
>  
>  	if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
> -		return;
> +		return -EIO;
>  
>  	doutc(cl, "sync\n");
>  	mutex_lock(&mdsc->mutex);
> @@ -5957,8 +5982,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
>  
>  	doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
>  
> -	flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> -	wait_caps_flush(mdsc, want_flush);
> +	ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> +	if (ret)
> +		return ret;
> +
> +	return wait_caps_flush(mdsc, want_flush);
>  }
>  
>  /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0428a5eaf28c6..a8b72cb13de1f 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -569,7 +569,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
>  extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
>  extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
>  
> -extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
> +extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
>  
>  extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
>  extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 7c1c1dac320da..6b0ad7a455815 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -125,6 +125,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>  {
>  	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
>  	struct ceph_client *cl = fsc->client;
> +	int ret;
>  
>  	if (!wait) {
>  		doutc(cl, "(non-blocking)\n");
> @@ -136,9 +137,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>  
>  	doutc(cl, "(blocking)\n");
>  	ceph_osdc_sync(&fsc->client->osdc);
> -	ceph_mdsc_sync(fsc->mdsc);
> +	ret = ceph_mdsc_sync(fsc->mdsc);
>  	doutc(cl, "(blocking) done\n");
> -	return 0;
> +	return ret;
>  }
>  
>  /*

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
  2026-02-09 23:03 ` Viacheslav Dubeyko
@ 2026-02-11  7:21 ` Sebastian Andrzej Siewior
  2026-02-13  7:51 ` Ionut Nechita (Wind River)
  2 siblings, 0 replies; 8+ messages in thread
From: Sebastian Andrzej Siewior @ 2026-02-11  7:21 UTC (permalink / raw)
  To: Ionut Nechita (Wind River)
  Cc: Ilya Dryomov, Alex Markuze, Viacheslav Dubeyko, Clark Williams,
	Steven Rostedt, ceph-devel, linux-kernel, linux-rt-devel,
	Ionut Nechita, Xiubo Li, Jeff Layton, superm1, jkosina

On 2026-02-08 15:18:20 [+0200], Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@windriver.com>
> 
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
> 
>   INFO: task sync:12345 blocked for more than 122 seconds.
>   Call Trace:
>     ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
>     ceph_sync_fs+0x31/0x130 [ceph]
>     iterate_supers+0x97/0x100
>     ksys_sync+0x32/0xb0
> 
> Three functions in the MDS sync path use indefinite waits:
> 
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
>    wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
> 
> This is particularly problematic in Kubernetes environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.

I may have misunderstood this but how is this different from a
!PREEMPT_RT kernel? As far as I understand, there should be no
difference in how both kernels react to the situation.
Could you check with lockdep and might_sleep if there a locking problem
and some kind of state is lost or wrongly interpreted?

Sebastian

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
  2026-02-09 23:03 ` Viacheslav Dubeyko
  2026-02-11  7:21 ` Sebastian Andrzej Siewior
@ 2026-02-13  7:51 ` Ionut Nechita (Wind River)
  2026-02-17 21:52   ` Viacheslav Dubeyko
  2 siblings, 1 reply; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-13  7:51 UTC (permalink / raw)
  To: idryomov
  Cc: amarkuze, bigeasy, ceph-devel, clrkwllms, ionut_n2001, jkosina,
	jlayton, linux-kernel, linux-rt-devel, rostedt, sage, slava,
	superm1, xiubli, Ionut Nechita

I also created a tracker issue for this on the Ceph bug tracker:

https://tracker.ceph.com/issues/74897

Thanks,
Ionut

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-13  7:51 ` Ionut Nechita (Wind River)
@ 2026-02-17 21:52   ` Viacheslav Dubeyko
  2026-02-18 19:57     ` Ionut Nechita (Wind River)
  0 siblings, 1 reply; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-17 21:52 UTC (permalink / raw)
  To: idryomov@gmail.com, ionut.nechita@windriver.com
  Cc: ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
	linux-rt-devel@lists.linux.dev, jkosina@suse.com,
	ceph-devel@vger.kernel.org, rostedt@goodmis.org,
	linux-kernel@vger.kernel.org, slava@dubeyko.com, Alex Markuze,
	jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
	superm1@kernel.org

On Fri, 2026-02-13 at 09:51 +0200, Ionut Nechita (Wind River) wrote:
> I also created a tracker issue for this on the Ceph bug tracker:
> 
> https://tracker.ceph.com/issues/74897
> 


It looks like that I was able to reproduce the symptoms of the issue by multiple
runs of generic/013 xfstests' test-case:

#!/bin/bash

while true; do
  sudo ./check generic/013
done

Feb 16 15:46:30 ceph-0005 kernel: [ 1845.346895] INFO: task fsstress:14466
blocked for more than 122 seconds.
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.347995]       Not tainted 6.19.0-rc8+
#10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.348530] "echo 0 >
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349426] task:fsstress        state:D
stack:0     pid:14466 tgid:14466 ppid:14464  task
_flags:0x400140 flags:0x00080800
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349438] Call Trace:
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349441]  <TASK>
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349445]  __schedule+0xe8a/0x57f0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349457]  ? kasan_save_stack+0x39/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349466]  ? kasan_save_stack+0x26/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349471]  ? kasan_save_track+0x14/0x40
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349475]  ?
kasan_save_free_info+0x3b/0x60
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349485]  ? __kasan_slab_free+0x7a/0xb0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349489]  ?
ceph_mdsc_release_request+0x6a3/0x880
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349497]  ?
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349502]  ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349507]  ?
__pv_queued_spin_lock_slowpath+0xb04/0xf80
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349514]  ? __pfx___schedule+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349520]  ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349525]  ?
__call_rcu_common+0x386/0x14b0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349532]  schedule+0x75/0x2f0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349538]  schedule_timeout+0x16d/0x210
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349542]  ?
__pfx_schedule_timeout+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349548]  ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349553]  ?
_raw_spin_lock_irq+0x8b/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349559]  ?
__pfx__raw_spin_lock_irq+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349565]  ? kasan_save_track+0x14/0x40
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349569]
wait_for_completion+0x14a/0x340
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349573]  ?
__pfx_wait_for_completion+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349577]  ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349582]  ? __pfx_mutex_unlock+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349587]  ceph_mdsc_sync+0x4b4/0xe80
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349593]  ?
__pfx_ceph_mdsc_sync+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349597]  ?
ceph_osdc_put_request+0x38/0x770
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349603]  ? ceph_osdc_sync+0x1cb/0x350
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349608]  ceph_sync_fs+0xa0/0x4c0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349612]  sync_filesystem+0x182/0x240
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349618]  __x64_sys_syncfs+0xac/0x160
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349623]  x64_sys_call+0x746/0x2360
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349629]  do_syscall_64+0x82/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349635]  ? __x64_sys_openat+0x108/0x240
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349641]  ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349647]  ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349655]  ?
__pfx___x64_sys_openat+0x10/0x10
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349661]  ?
__kasan_check_write+0x14/0x30
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349667]  ? ksys_write+0x1a3/0x230
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349672]  ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349677]  ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349682]  ? do_syscall_64+0xbf/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349687]  ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349692]  ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349705]  ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349709]  ? do_syscall_64+0xbf/0x5d0
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349715]  ? __kasan_check_read+0x11/0x20
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349720]  ?
fpregs_assert_state_consistent+0x5c/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349724]  ? irqentry_exit+0xa5/0x600
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349730]  ? exc_page_fault+0x95/0x100
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349736]
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349740] RIP: 0033:0x792fb1d1ba4b
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349745] RSP: 002b:00007ffc3844eb58
EFLAGS: 00000246 ORIG_RAX: 0000000000000132
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349752] RAX: ffffffffffffffda RBX:
0000000000000000 RCX: 0000792fb1d1ba4b
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349756] RDX: 0000000000000000 RSI:
000059045610b440 RDI: 0000000000000004
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349759] RBP: 0000000000000004 R08:
0000000000000026 R09: 00007ffc3844e986
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349762] R10: 0000000000000000 R11:
0000000000000246 R12: 0000000000000149
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349765] R13: 00007ffc3844eba0 R14:
000059042de9d0b3 R15: 0000000000000149
Feb 16 15:46:30 ceph-0005 kernel: [ 1845.349771]  </TASK>

l *ceph_mdsc_sync+0x4b4
0xffffffff82cddbe4 is in ceph_mdsc_sync (fs/ceph/mds_client.c:5916).
5911                }
5912                doutc(cl, "wait on %llu (want %llu)\n",
5913                      req->r_tid, want_tid);
5914                wait_for_completion(&req->r_safe_completion);
5915    
5916                mutex_lock(&mdsc->mutex);
5917                ceph_mdsc_put_request(req);
5918                if (!nextreq)
5919                    break;  /* next dne before, so we're done! */
5920                if (RB_EMPTY_NODE(&nextreq->r_node)) {

I am not sure yet that reason is the same.

Thanks,
Slava.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-17 21:52   ` Viacheslav Dubeyko
@ 2026-02-18 19:57     ` Ionut Nechita (Wind River)
  2026-02-18 20:04       ` Viacheslav Dubeyko
  0 siblings, 1 reply; 8+ messages in thread
From: Ionut Nechita (Wind River) @ 2026-02-18 19:57 UTC (permalink / raw)
  To: Viacheslav Dubeyko, Ilya Dryomov
  Cc: ceph-devel, linux-kernel, linux-rt-devel, Alex Markuze, Xiubo Li,
	sage, jlayton, rostedt, bigeasy, clrkwllms, superm1, jkosina,
	ionut_n2001

Hi Slava,

Thanks for testing and reproducing this with generic/013.

Looking at the stack trace you shared:

  ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
  ceph_sync_fs
  sync_filesystem
  __x64_sys_syncfs

This is the same pattern we see in the original report - the sync path
blocks indefinitely on wait_for_completion() with no timeout. In your
case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
exactly what patch 2/3 ("ceph: add timeout protection to
ceph_mdsc_sync() path") addresses.

The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
but the symptom and the fix are the same - these wait_for_completion()
calls in the sync path need timeout protection regardless of what causes
the underlying delay.

All three patches are now also on LKML:

  1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
  2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
  3/3 - ceph: add timeout protection to ceph_osdc_sync() path

I've also added more details and debug information to the Ceph tracker
issue at https://tracker.ceph.com/issues/74897 - it might help with
your investigation.

Thanks,
Ionut

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-18 19:57     ` Ionut Nechita (Wind River)
@ 2026-02-18 20:04       ` Viacheslav Dubeyko
  2026-02-19  9:37         ` Alex Markuze
  0 siblings, 1 reply; 8+ messages in thread
From: Viacheslav Dubeyko @ 2026-02-18 20:04 UTC (permalink / raw)
  To: idryomov@gmail.com, ionut.nechita@windriver.com
  Cc: ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
	linux-rt-devel@lists.linux.dev, ceph-devel@vger.kernel.org,
	rostedt@goodmis.org, linux-kernel@vger.kernel.org, Alex Markuze,
	jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
	superm1@kernel.org, jkosina@suse.com

On Wed, 2026-02-18 at 21:57 +0200, Ionut Nechita (Wind River) wrote:
> Hi Slava,
> 
> Thanks for testing and reproducing this with generic/013.
> 
> Looking at the stack trace you shared:
> 
>   ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
>   ceph_sync_fs
>   sync_filesystem
>   __x64_sys_syncfs
> 
> This is the same pattern we see in the original report - the sync path
> blocks indefinitely on wait_for_completion() with no timeout. In your
> case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
> exactly what patch 2/3 ("ceph: add timeout protection to
> ceph_mdsc_sync() path") addresses.
> 
> The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
> but the symptom and the fix are the same - these wait_for_completion()
> calls in the sync path need timeout protection regardless of what causes
> the underlying delay.
> 
> All three patches are now also on LKML:
> 
>   1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
>   2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
>   3/3 - ceph: add timeout protection to ceph_osdc_sync() path
> 
> I've also added more details and debug information to the Ceph tracker
> issue at https://tracker.ceph.com/issues/74897 - it might help with
> your investigation.

Frankly speaking, I don't see the situation of blocked thread if I am adding
debug output. It looks like a race condition. And I am not sure now that adding
timeout is the proper fix. Probably, we have some issue that needs to be fixed
and timeout looks like workaround but not the fix. I don't think that I have the
IPv6/EADDRNOTAVAIL case on my side.

Thanks,
Slava.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path
  2026-02-18 20:04       ` Viacheslav Dubeyko
@ 2026-02-19  9:37         ` Alex Markuze
  0 siblings, 0 replies; 8+ messages in thread
From: Alex Markuze @ 2026-02-19  9:37 UTC (permalink / raw)
  To: Viacheslav Dubeyko
  Cc: idryomov@gmail.com, ionut.nechita@windriver.com,
	ionut_n2001@yahoo.com, sage@newdream.net, Xiubo Li,
	linux-rt-devel@lists.linux.dev, ceph-devel@vger.kernel.org,
	rostedt@goodmis.org, linux-kernel@vger.kernel.org,
	jlayton@kernel.org, bigeasy@linutronix.de, clrkwllms@kernel.org,
	superm1@kernel.org, jkosina@suse.com

I tend to agree here with Slava, I don't support any Timeout as a
solution before we have an actual RCA.

On Wed, Feb 18, 2026 at 10:04 PM Viacheslav Dubeyko
<Slava.Dubeyko@ibm.com> wrote:
>
> On Wed, 2026-02-18 at 21:57 +0200, Ionut Nechita (Wind River) wrote:
> > Hi Slava,
> >
> > Thanks for testing and reproducing this with generic/013.
> >
> > Looking at the stack trace you shared:
> >
> >   ceph_mdsc_sync+0x4b4 -> wait_for_completion(&req->r_safe_completion)
> >   ceph_sync_fs
> >   sync_filesystem
> >   __x64_sys_syncfs
> >
> > This is the same pattern we see in the original report - the sync path
> > blocks indefinitely on wait_for_completion() with no timeout. In your
> > case it's ceph_mdsc_sync() hanging on r_safe_completion, which is
> > exactly what patch 2/3 ("ceph: add timeout protection to
> > ceph_mdsc_sync() path") addresses.
> >
> > The root cause may differ from the original IPv6/EADDRNOTAVAIL scenario,
> > but the symptom and the fix are the same - these wait_for_completion()
> > calls in the sync path need timeout protection regardless of what causes
> > the underlying delay.
> >
> > All three patches are now also on LKML:
> >
> >   1/3 - libceph: handle EADDRNOTAVAIL more gracefully (v2)
> >   2/3 - ceph: add timeout protection to ceph_mdsc_sync() path
> >   3/3 - ceph: add timeout protection to ceph_osdc_sync() path
> >
> > I've also added more details and debug information to the Ceph tracker
> > issue at https://tracker.ceph.com/issues/74897 - it might help with
> > your investigation.
>
> Frankly speaking, I don't see the situation of blocked thread if I am adding
> debug output. It looks like a race condition. And I am not sure now that adding
> timeout is the proper fix. Probably, we have some issue that needs to be fixed
> and timeout looks like workaround but not the fix. I don't think that I have the
> IPv6/EADDRNOTAVAIL case on my side.
>
> Thanks,
> Slava.


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-02-19  9:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-02-08 13:18 [PATCH] ceph: add timeout protection to ceph_mdsc_sync() path Ionut Nechita (Wind River)
2026-02-09 23:03 ` Viacheslav Dubeyko
2026-02-11  7:21 ` Sebastian Andrzej Siewior
2026-02-13  7:51 ` Ionut Nechita (Wind River)
2026-02-17 21:52   ` Viacheslav Dubeyko
2026-02-18 19:57     ` Ionut Nechita (Wind River)
2026-02-18 20:04       ` Viacheslav Dubeyko
2026-02-19  9:37         ` Alex Markuze

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox