public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jaegeuk Kim <jaegeuk@kernel.org>
To: Chao Yu <chao@kernel.org>
Cc: linux-f2fs-devel@lists.sourceforge.net, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 01/14] f2fs: add lock elapsed time trace facility for f2fs rwsemphore
Date: Sun, 4 Jan 2026 05:28:03 +0000	[thread overview]
Message-ID: <aVn6YymF08qGPJ3P@google.com> (raw)
In-Reply-To: <20260104020729.1064529-1-chao@kernel.org>

Stuck when disabling checkpoint:

[  403.438233][  T739]  __switch_to+0x164/0x324
[  403.442532][  T739]  __schedule+0xbd0/0x1d28
[  403.446822][  T739]  schedule+0x4c/0x118
[  403.450758][  T739]  schedule_preempt_disabled+0x24/0x44
[  403.456083][  T739]  rwsem_down_write_slowpath+0x438/0x988
[  403.461588][  T739]  down_write+0x10c/0x16c
[  403.465785][  T739]  f2fs_down_write_trace+0x1c/0x70
[  403.470797][  T739]  f2fs_disable_checkpoint+0x114/0x2b8
[  403.476126][  T739]  f2fs_fill_super+0x22c8/0x2728
[  403.480924][  T739]  get_tree_bdev_flags+0x144/0x1dc
[  403.485894][  T739]  get_tree_bdev+0x14/0x24
[  403.490176][  T739]  f2fs_get_tree+0x18/0x28
[  403.494448][  T739]  vfs_get_tree+0x48/0x10c
[  403.498724][  T739]  path_mount+0x62c/0xb74

On 01/04, Chao Yu wrote:
> This patch adds lock elapsed time trace facility for f2fs rwsemphore.
> 
> If total elapsed time of critical region covered by lock exceeds a
> threshold, it will print tracepoint to dump information of lock related
> context, including:
> - thread information
> - CPU/IO priority
> - lock information
> - elapsed time
>  - total time
>  - running time (depend on CONFIG_64BIT)
>  - runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS)
>  - io sleep time (depend on CONFIG_TASK_DELAY_ACCT and
> 		  /proc/sys/kernel/task_delayacct)
>  - other time    (by default other time will account nonio sleep time,
>                   but, if above kconfig is not defined, other time will
>                   include runnable time and/or io sleep time as wll)
> 
> output:
>     f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120, ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, total: 1000, running: 993, runnable: 2, io_sleep: 0, other: 5
> 
> Signed-off-by: Chao Yu <chao@kernel.org>
> ---
>  fs/f2fs/checkpoint.c        | 106 ++++++++++++++++++++++++++++++++++++
>  fs/f2fs/f2fs.h              |  51 +++++++++++++++--
>  include/trace/events/f2fs.h |  68 +++++++++++++++++++++++
>  3 files changed, 221 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 300664269eb6..bc6058a3122b 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -14,6 +14,9 @@
>  #include <linux/pagevec.h>
>  #include <linux/swap.h>
>  #include <linux/kthread.h>
> +#include <linux/delayacct.h>
> +#include <linux/ioprio.h>
> +#include <linux/math64.h>
>  
>  #include "f2fs.h"
>  #include "node.h"
> @@ -21,6 +24,109 @@
>  #include "iostat.h"
>  #include <trace/events/f2fs.h>
>  
> +static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
> +{
> +	ts->total_time = ktime_get();
> +#ifdef CONFIG_64BIT
> +	ts->running_time = current->se.sum_exec_runtime;
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +	ts->runnable_time = current->sched_info.run_delay;
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +	if (current->delays)
> +		ts->io_sleep_time = current->delays->blkio_delay;
> +#endif
> +}
> +
> +static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
> +						struct f2fs_lock_context *lc)
> +{
> +	lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
> +	if (!lc->lock_trace)
> +		return;
> +
> +	get_lock_elapsed_time(&lc->ts);
> +}
> +
> +static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
> +				struct f2fs_lock_context *lc, bool is_write)
> +{
> +	struct f2fs_time_stat tts;
> +	unsigned long long total_time;
> +	unsigned long long running_time = 0;
> +	unsigned long long runnable_time = 0;
> +	unsigned long long io_sleep_time = 0;
> +	unsigned long long other_time = 0;
> +	unsigned npm = NSEC_PER_MSEC;
> +
> +	if (!lc->lock_trace)
> +		return;
> +
> +	get_lock_elapsed_time(&tts);
> +
> +	total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
> +	if (total_time <= MAX_LOCK_ELAPSED_TIME)
> +		return;
> +
> +#ifdef CONFIG_64BIT
> +	running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +	runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +	io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
> +#endif
> +	if (total_time > running_time + io_sleep_time + runnable_time)
> +		other_time = total_time - running_time -
> +				io_sleep_time - runnable_time;
> +
> +	trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
> +			get_current_ioprio(), total_time, running_time,
> +			runnable_time, io_sleep_time, other_time);
> +}
> +
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	f2fs_down_read(sem);
> +	trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	if (!f2fs_down_read_trylock(sem))
> +		return 0;
> +	trace_lock_elapsed_time_start(sem, lc);
> +	return 1;
> +}
> +
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	f2fs_up_read(sem);
> +	trace_lock_elapsed_time_end(sem, lc, false);
> +}
> +
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	f2fs_down_write(sem);
> +	trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	if (!f2fs_down_write_trylock(sem))
> +		return 0;
> +	trace_lock_elapsed_time_start(sem, lc);
> +	return 1;
> +}
> +
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +	f2fs_up_write(sem);
> +	trace_lock_elapsed_time_end(sem, lc, true);
> +}
> +
>  #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
>  
>  static struct kmem_cache *ino_entry_slab;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c3e1611fce18..7e22315dbedd 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -173,6 +173,10 @@ enum device_allocation_policy {
>  	ALLOCATE_FORWARD_FROM_HINT,
>  };
>  
> +enum f2fs_lock_name {
> +	LOCK_NAME_NONE,
> +};
> +
>  /*
>   * An implementation of an rwsem that is explicitly unfair to readers. This
>   * prevents priority inversion when a low-priority reader acquires the read lock
> @@ -181,6 +185,8 @@ enum device_allocation_policy {
>   */
>  
>  struct f2fs_rwsem {
> +	struct f2fs_sb_info *sbi;
> +	enum f2fs_lock_name name;
>          struct rw_semaphore internal_rwsem;
>  #ifdef CONFIG_F2FS_UNFAIR_RWSEM
>          wait_queue_head_t read_waiters;
> @@ -1409,6 +1415,24 @@ struct f2fs_gc_control {
>  	unsigned int nr_free_secs;	/* # of free sections to do GC */
>  };
>  
> +struct f2fs_time_stat {
> +	unsigned long long total_time;		/* total wall clock time */
> +#ifdef CONFIG_64BIT
> +	unsigned long long running_time;	/* running time */
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +	unsigned long long runnable_time;	/* runnable(including preempted) time */
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +	unsigned long long io_sleep_time;	/* IO sleep time */
> +#endif
> +};
> +
> +struct f2fs_lock_context {
> +	struct f2fs_time_stat ts;
> +	bool lock_trace;
> +};
> +
>  /*
>   * For s_flag in struct f2fs_sb_info
>   * Modification on enum should be synchronized with s_flag array
> @@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode {
>  	LOOKUP_AUTO,
>  };
>  
> +/* a threshold of maximum elapsed time in critical region to print tracepoint */
> +#define MAX_LOCK_ELAPSED_TIME		500
> +
>  static inline int f2fs_test_bit(unsigned int nr, char *addr);
>  static inline void f2fs_set_bit(unsigned int nr, char *addr);
>  static inline void f2fs_clear_bit(unsigned int nr, char *addr);
> @@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
>  	spin_unlock_irqrestore(&sbi->cp_lock, flags);
>  }
>  
> -#define init_f2fs_rwsem(sem)					\
> +#define init_f2fs_rwsem(sem)	__init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
> +#define init_f2fs_rwsem_trace	__init_f2fs_rwsem
> +
> +#define __init_f2fs_rwsem(sem, sbi, name)			\
>  do {								\
>  	static struct lock_class_key __key;			\
>  								\
> -	__init_f2fs_rwsem((sem), #sem, &__key);			\
> +	do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name);	\
>  } while (0)
>  
> -static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
> -		const char *sem_name, struct lock_class_key *key)
> +static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
> +		const char *sem_name, struct lock_class_key *key,
> +		struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
>  {
> +	sem->sbi = sbi;
> +	sem->name = name;
>  	__init_rwsem(&sem->internal_rwsem, sem_name, key);
>  #ifdef CONFIG_F2FS_UNFAIR_RWSEM
>  	init_waitqueue_head(&sem->read_waiters);
> @@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
>  #endif
>  }
>  
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
> +						struct f2fs_lock_context *lc);
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem,
> +						struct f2fs_lock_context *lc);
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
> +						struct f2fs_lock_context *lc);
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +
>  static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
>  {
>  	unsigned long flags;
> diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
> index 635dcabcf1c7..9a852a16df9c 100644
> --- a/include/trace/events/f2fs.h
> +++ b/include/trace/events/f2fs.h
> @@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
>  		{ CP_PHASE_FINISH_BLOCK_OPS,	"finish block_ops" },			\
>  		{ CP_PHASE_FINISH_CHECKPOINT,	"finish checkpoint" })
>  
> +#define show_lock_name(lock)						\
> +	__print_symbolic(lock,						\
> +		{ LOCK_NAME_NONE,		"none" })
> +
>  struct f2fs_sb_info;
>  struct f2fs_io_info;
>  struct extent_info;
> @@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
>  	TP_ARGS(inode, offset, bytes)
>  );
>  
> +TRACE_EVENT(f2fs_lock_elapsed_time,
> +
> +	TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
> +		bool is_write, struct task_struct *p, int ioprio,
> +		unsigned long long total_time,
> +		unsigned long long running_time,
> +		unsigned long long runnable_time,
> +		unsigned long long io_sleep_time,
> +		unsigned long long other_time),
> +
> +	TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
> +		runnable_time, io_sleep_time, other_time),
> +
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__array(char, comm, TASK_COMM_LEN)
> +		__field(pid_t, pid)
> +		__field(int, prio)
> +		__field(int, ioprio_class)
> +		__field(int, ioprio_data)
> +		__field(unsigned int, lock_name)
> +		__field(bool, is_write)
> +		__field(unsigned long long, total_time)
> +		__field(unsigned long long, running_time)
> +		__field(unsigned long long, runnable_time)
> +		__field(unsigned long long, io_sleep_time)
> +		__field(unsigned long long, other_time)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= sbi->sb->s_dev;
> +		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
> +		__entry->pid		= p->pid;
> +		__entry->prio		= p->prio;
> +		__entry->ioprio_class	= IOPRIO_PRIO_CLASS(ioprio);
> +		__entry->ioprio_data	= IOPRIO_PRIO_DATA(ioprio);
> +		__entry->lock_name	= lock_name;
> +		__entry->is_write	= is_write;
> +		__entry->total_time	= total_time;
> +		__entry->running_time	= running_time;
> +		__entry->runnable_time	= runnable_time;
> +		__entry->io_sleep_time	= io_sleep_time;
> +		__entry->other_time	= other_time;
> +	),
> +
> +	TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
> +		"ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
> +		"lock_type: %s, total: %llu, running: %llu, "
> +		"runnable: %llu, io_sleep: %llu, other: %llu",
> +		show_dev(__entry->dev),
> +		__entry->comm,
> +		__entry->pid,
> +		__entry->prio,
> +		__entry->ioprio_class,
> +		__entry->ioprio_data,
> +		show_lock_name(__entry->lock_name),
> +		__entry->is_write ? "wlock" : "rlock",
> +		__entry->total_time,
> +		__entry->running_time,
> +		__entry->runnable_time,
> +		__entry->io_sleep_time,
> +		__entry->other_time)
> +);
> +
>  #endif /* _TRACE_F2FS_H */
>  
>   /* This part must be outside protection */
> -- 
> 2.49.0

  parent reply	other threads:[~2026-01-04  5:28 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-04  2:07 [PATCH 01/14] f2fs: add lock elapsed time trace facility for f2fs rwsemphore Chao Yu
2026-01-04  2:07 ` [PATCH 02/14] f2fs: sysfs: introduce max_lock_elapsed_time Chao Yu
2026-01-04  2:07 ` [PATCH 03/14] f2fs: trace elapsed time for cp_rwsem lock Chao Yu
2026-01-04  2:07 ` [PATCH 04/14] f2fs: trace elapsed time for node_change lock Chao Yu
2026-01-04  2:07 ` [PATCH 05/14] f2fs: trace elapsed time for node_write lock Chao Yu
2026-01-04  2:07 ` [PATCH 06/14] f2fs: trace elapsed time for gc_lock lock Chao Yu
2026-01-04  5:35   ` Jaegeuk Kim
2026-01-04  5:42     ` Jaegeuk Kim
2026-01-04  6:27       ` Chao Yu
2026-01-04  2:07 ` [PATCH 07/14] f2fs: trace elapsed time for cp_global_sem lock Chao Yu
2026-01-04  2:07 ` [PATCH 08/14] f2fs: trace elapsed time for io_rwsem lock Chao Yu
2026-01-04  2:07 ` [PATCH 09/14] f2fs: clean up w/ __f2fs_schedule_timeout() Chao Yu
2026-01-04  2:07 ` [PATCH 10/14] f2fs: fix to use jiffies based precision for DEFAULT_SCHEDULE_TIMEOUT Chao Yu
2026-01-04  2:07 ` [PATCH 11/14] f2fs: fix timeout precision of f2fs_io_schedule_timeout_killable() Chao Yu
2026-01-04  2:07 ` [PATCH 12/14] f2fs: rename FAULT_TIMEOUT to FAULT_ATOMIC_TIMEOUT Chao Yu
2026-01-04  2:07 ` [PATCH 13/14] f2fs: introduce FAULT_LOCK_TIMEOUT Chao Yu
2026-01-04  2:07 ` [PATCH 14/14] f2fs: sysfs: introduce inject_lock_timeout Chao Yu
2026-01-04  5:28 ` Jaegeuk Kim [this message]
2026-01-07  3:30 ` [f2fs-dev] [PATCH 01/14] f2fs: add lock elapsed time trace facility for f2fs rwsemphore patchwork-bot+f2fs

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aVn6YymF08qGPJ3P@google.com \
    --to=jaegeuk@kernel.org \
    --cc=chao@kernel.org \
    --cc=linux-f2fs-devel@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox