linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/2] futex: Add check_robust_futex to verify process usage of robust_futex
@ 2025-08-04  3:03 zhongjinji
  2025-08-04  3:03 ` [PATCH v3 2/2] mm/oom_kill: Only delay OOM reaper for processes using robust futex zhongjinji
  0 siblings, 1 reply; 3+ messages in thread
From: zhongjinji @ 2025-08-04  3:03 UTC (permalink / raw)
  To: linux-mm
  Cc: akpm, mhocko, rientjes, shakeel.butt, npache, linux-kernel, tglx,
	mingo, peterz, dvhart, dave, andrealmeid, liulu.liu, feng.han

From: zhongjinji <zhongjinji@honor.com>

The check_robust_futex function is added to detect whether a process uses
robust_futex.

According to the patch discussion
(https://lore.kernel.org/all/20220414144042.677008-1-npache@redhat.com/T/#u),
executing the OOM reaper too early on processes using robust_futex may cause
the lock holder to wait indefinitely.

Therefore, this patch introduces check_robust_futex to identify such
processes during OOM reaper execution, and delays the OOM reaper specifically
for processes using robust_futex.

Signed-off-by: zhongjinji <zhongjinji@honor.com>
---
 include/linux/futex.h | 11 ++++++++++-
 kernel/futex/core.c   | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 9e9750f04980..c879300d4ff8 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -81,7 +81,8 @@ void futex_exec_release(struct task_struct *tsk);
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
-
+bool check_robust_futex(struct task_struct *p);
+bool check_robust_futex_rcu(struct task_struct *p);
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 int futex_hash_allocate_default(void);
 void futex_hash_free(struct mm_struct *mm);
@@ -108,6 +109,14 @@ static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsig
 {
 	return -EINVAL;
 }
+static inline bool check_robust_futex(struct task_struct *p)
+{
+	return false;
+}
+static inline bool check_robust_futex_rcu(struct task_struct *p)
+{
+	return false;
+}
 static inline int futex_hash_allocate_default(void)
 {
 	return 0;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d9bb5567af0c..c9d8a1947bf1 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1513,6 +1513,31 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
+bool check_robust_futex(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	for_each_thread(p, t) {
+		if (unlikely(t->robust_list))
+			return true;
+#ifdef CONFIG_COMPAT
+		if (unlikely(t->compat_robust_list))
+			return true;
+#endif
+	}
+	return false;
+}
+
+bool check_robust_futex_rcu(struct task_struct *p)
+{
+	bool has_robust;
+
+	rcu_read_lock();
+	has_robust = check_robust_futex(p);
+	rcu_read_unlock();
+	return has_robust;
+}
+
 static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 				   struct futex_private_hash *fph)
 {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH v3 2/2] mm/oom_kill: Only delay OOM reaper for processes using robust futex
  2025-08-04  3:03 [PATCH v3 1/2] futex: Add check_robust_futex to verify process usage of robust_futex zhongjinji
@ 2025-08-04  3:03 ` zhongjinji
  2025-08-04  6:04   ` Michal Hocko
  0 siblings, 1 reply; 3+ messages in thread
From: zhongjinji @ 2025-08-04  3:03 UTC (permalink / raw)
  To: linux-mm
  Cc: akpm, mhocko, rientjes, shakeel.butt, npache, linux-kernel, tglx,
	mingo, peterz, dvhart, dave, andrealmeid, liulu.liu, feng.han

From: zhongjinji <zhongjinji@honor.com>

After merging the patch
https://lore.kernel.org/all/20220414144042.677008-1-npache@redhat.com/T/#u,
the OOM reaper runs less frequently because many processes exit within 2 seconds.

However, when a process is killed, timely handling by the OOM reaper allows
its memory to be freed faster.

Since relatively few processes use robust futex, delaying the OOM reaper for
all processes is undesirable, as many killed processes cannot release memory
more quickly.

This patch modifies the behavior so that only processes using robust futex
are delayed by the OOM reaper, allowing the OOM reaper to handle more
processes in a timely manner.

Signed-off-by: zhongjinji <zhongjinji@honor.com>
---
 mm/oom_kill.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..c558ac93ae7d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/futex.h>
 #include <linux/cpuset.h>
 #include <linux/export.h>
 #include <linux/notifier.h>
@@ -692,7 +693,7 @@ static void wake_oom_reaper(struct timer_list *timer)
  * before the exit path is able to wake the futex waiters.
  */
 #define OOM_REAPER_DELAY (2*HZ)
-static void queue_oom_reaper(struct task_struct *tsk)
+static void queue_oom_reaper(struct task_struct *tsk, bool delay)
 {
 	/* mm is already queued? */
 	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
@@ -700,7 +701,7 @@ static void queue_oom_reaper(struct task_struct *tsk)
 
 	get_task_struct(tsk);
 	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
-	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+	tsk->oom_reaper_timer.expires = jiffies + (delay ? OOM_REAPER_DELAY : 0);
 	add_timer(&tsk->oom_reaper_timer);
 }
 
@@ -742,7 +743,7 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static inline void queue_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk, bool delay)
 {
 }
 #endif /* CONFIG_MMU */
@@ -871,11 +872,12 @@ static inline bool __task_will_free_mem(struct task_struct *task)
  * Caller has to make sure that task->mm is stable (hold task_lock or
  * it operates on the current).
  */
-static bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task, bool *delay_reap)
 {
 	struct mm_struct *mm = task->mm;
 	struct task_struct *p;
 	bool ret = true;
+	bool has_robust = !delay_reap;
 
 	/*
 	 * Skip tasks without mm because it might have passed its exit_mm and
@@ -888,6 +890,15 @@ static bool task_will_free_mem(struct task_struct *task)
 	if (!__task_will_free_mem(task))
 		return false;
 
+	/*
+	 * Check if a process is using robust futexes. If so, delay its handling by the
+	 * OOM reaper. The reason is that if the owner of a robust futex lock is killed
+	 * while waiters are still alive, the OOM reaper might free the robust futex
+	 * resources before futex_cleanup runs, causing the waiters to wait indefinitely.
+	 */
+	if (!has_robust)
+		has_robust = check_robust_futex_rcu(task);
+
 	/*
 	 * This task has already been drained by the oom reaper so there are
 	 * only small chances it will free some more
@@ -912,8 +923,12 @@ static bool task_will_free_mem(struct task_struct *task)
 		ret = __task_will_free_mem(p);
 		if (!ret)
 			break;
+		if (!has_robust)
+			has_robust = check_robust_futex(p);
 	}
 	rcu_read_unlock();
+	if (delay_reap)
+		*delay_reap = has_robust;
 
 	return ret;
 }
@@ -923,6 +938,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	struct task_struct *p;
 	struct mm_struct *mm;
 	bool can_oom_reap = true;
+	bool delay_reap;
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
@@ -950,6 +966,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	 * reserves from the user space under its control.
 	 */
 	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
+	delay_reap = check_robust_futex_rcu(victim);
 	mark_oom_victim(victim);
 	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
 		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
@@ -990,11 +1007,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 		if (unlikely(p->flags & PF_KTHREAD))
 			continue;
 		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+		if (!delay_reap)
+			delay_reap = check_robust_futex(p);
 	}
 	rcu_read_unlock();
 
 	if (can_oom_reap)
-		queue_oom_reaper(victim);
+		queue_oom_reaper(victim, delay_reap);
 
 	mmdrop(mm);
 	put_task_struct(victim);
@@ -1020,6 +1039,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	struct mem_cgroup *oom_group;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
+	bool delay_reap = false;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -1027,9 +1047,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 * so it can die quickly
 	 */
 	task_lock(victim);
-	if (task_will_free_mem(victim)) {
+	if (task_will_free_mem(victim, &delay_reap)) {
 		mark_oom_victim(victim);
-		queue_oom_reaper(victim);
+		queue_oom_reaper(victim, delay_reap);
 		task_unlock(victim);
 		put_task_struct(victim);
 		return;
@@ -1112,6 +1132,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
+	bool delay_reap = false;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1128,9 +1149,9 @@ bool out_of_memory(struct oom_control *oc)
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (task_will_free_mem(current)) {
+	if (task_will_free_mem(current, &delay_reap)) {
 		mark_oom_victim(current);
-		queue_oom_reaper(current);
+		queue_oom_reaper(current, delay_reap);
 		return true;
 	}
 
@@ -1231,7 +1252,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	mm = p->mm;
 	mmgrab(mm);
 
-	if (task_will_free_mem(p))
+	if (task_will_free_mem(p, NULL))
 		reap = true;
 	else {
 		/* Error only if the work has not been done already */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v3 2/2] mm/oom_kill: Only delay OOM reaper for processes using robust futex
  2025-08-04  3:03 ` [PATCH v3 2/2] mm/oom_kill: Only delay OOM reaper for processes using robust futex zhongjinji
@ 2025-08-04  6:04   ` Michal Hocko
  0 siblings, 0 replies; 3+ messages in thread
From: Michal Hocko @ 2025-08-04  6:04 UTC (permalink / raw)
  To: zhongjinji
  Cc: linux-mm, akpm, rientjes, shakeel.butt, npache, linux-kernel,
	tglx, mingo, peterz, dvhart, dave, andrealmeid, liulu.liu,
	feng.han

I have only noticed this now. Did you have any reason to repost v3
without any prior feedback on v2 and without any changelog from v2?

On Mon 04-08-25 11:03:41, zhongjinji@honor.com wrote:
> From: zhongjinji <zhongjinji@honor.com>
> 
> After merging the patch
> https://lore.kernel.org/all/20220414144042.677008-1-npache@redhat.com/T/#u,
> the OOM reaper runs less frequently because many processes exit within 2 seconds.
> 
> However, when a process is killed, timely handling by the OOM reaper allows
> its memory to be freed faster.
> 
> Since relatively few processes use robust futex, delaying the OOM reaper for
> all processes is undesirable, as many killed processes cannot release memory
> more quickly.
> 
> This patch modifies the behavior so that only processes using robust futex
> are delayed by the OOM reaper, allowing the OOM reaper to handle more
> processes in a timely manner.
> 
> Signed-off-by: zhongjinji <zhongjinji@honor.com>
> ---
>  mm/oom_kill.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 25923cfec9c6..c558ac93ae7d 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -30,6 +30,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/timex.h>
>  #include <linux/jiffies.h>
> +#include <linux/futex.h>
>  #include <linux/cpuset.h>
>  #include <linux/export.h>
>  #include <linux/notifier.h>
> @@ -692,7 +693,7 @@ static void wake_oom_reaper(struct timer_list *timer)
>   * before the exit path is able to wake the futex waiters.
>   */
>  #define OOM_REAPER_DELAY (2*HZ)
> -static void queue_oom_reaper(struct task_struct *tsk)
> +static void queue_oom_reaper(struct task_struct *tsk, bool delay)
>  {
>  	/* mm is already queued? */
>  	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
> @@ -700,7 +701,7 @@ static void queue_oom_reaper(struct task_struct *tsk)
>  
>  	get_task_struct(tsk);
>  	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
> -	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
> +	tsk->oom_reaper_timer.expires = jiffies + (delay ? OOM_REAPER_DELAY : 0);
>  	add_timer(&tsk->oom_reaper_timer);
>  }
>  
> @@ -742,7 +743,7 @@ static int __init oom_init(void)
>  }
>  subsys_initcall(oom_init)
>  #else
> -static inline void queue_oom_reaper(struct task_struct *tsk)
> +static inline void queue_oom_reaper(struct task_struct *tsk, bool delay)
>  {
>  }
>  #endif /* CONFIG_MMU */
> @@ -871,11 +872,12 @@ static inline bool __task_will_free_mem(struct task_struct *task)
>   * Caller has to make sure that task->mm is stable (hold task_lock or
>   * it operates on the current).
>   */
> -static bool task_will_free_mem(struct task_struct *task)
> +static bool task_will_free_mem(struct task_struct *task, bool *delay_reap)
>  {
>  	struct mm_struct *mm = task->mm;
>  	struct task_struct *p;
>  	bool ret = true;
> +	bool has_robust = !delay_reap;
>  
>  	/*
>  	 * Skip tasks without mm because it might have passed its exit_mm and
> @@ -888,6 +890,15 @@ static bool task_will_free_mem(struct task_struct *task)
>  	if (!__task_will_free_mem(task))
>  		return false;
>  
> +	/*
> +	 * Check if a process is using robust futexes. If so, delay its handling by the
> +	 * OOM reaper. The reason is that if the owner of a robust futex lock is killed
> +	 * while waiters are still alive, the OOM reaper might free the robust futex
> +	 * resources before futex_cleanup runs, causing the waiters to wait indefinitely.
> +	 */
> +	if (!has_robust)
> +		has_robust = check_robust_futex_rcu(task);
> +
>  	/*
>  	 * This task has already been drained by the oom reaper so there are
>  	 * only small chances it will free some more
> @@ -912,8 +923,12 @@ static bool task_will_free_mem(struct task_struct *task)
>  		ret = __task_will_free_mem(p);
>  		if (!ret)
>  			break;
> +		if (!has_robust)
> +			has_robust = check_robust_futex(p);
>  	}
>  	rcu_read_unlock();
> +	if (delay_reap)
> +		*delay_reap = has_robust;
>  
>  	return ret;
>  }
> @@ -923,6 +938,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
>  	struct task_struct *p;
>  	struct mm_struct *mm;
>  	bool can_oom_reap = true;
> +	bool delay_reap;
>  
>  	p = find_lock_task_mm(victim);
>  	if (!p) {
> @@ -950,6 +966,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
>  	 * reserves from the user space under its control.
>  	 */
>  	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
> +	delay_reap = check_robust_futex_rcu(victim);
>  	mark_oom_victim(victim);
>  	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
>  		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
> @@ -990,11 +1007,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
>  		if (unlikely(p->flags & PF_KTHREAD))
>  			continue;
>  		do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
> +		if (!delay_reap)
> +			delay_reap = check_robust_futex(p);
>  	}
>  	rcu_read_unlock();
>  
>  	if (can_oom_reap)
> -		queue_oom_reaper(victim);
> +		queue_oom_reaper(victim, delay_reap);
>  
>  	mmdrop(mm);
>  	put_task_struct(victim);
> @@ -1020,6 +1039,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
>  	struct mem_cgroup *oom_group;
>  	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
>  					      DEFAULT_RATELIMIT_BURST);
> +	bool delay_reap = false;
>  
>  	/*
>  	 * If the task is already exiting, don't alarm the sysadmin or kill
> @@ -1027,9 +1047,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
>  	 * so it can die quickly
>  	 */
>  	task_lock(victim);
> -	if (task_will_free_mem(victim)) {
> +	if (task_will_free_mem(victim, &delay_reap)) {
>  		mark_oom_victim(victim);
> -		queue_oom_reaper(victim);
> +		queue_oom_reaper(victim, delay_reap);
>  		task_unlock(victim);
>  		put_task_struct(victim);
>  		return;
> @@ -1112,6 +1132,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
>  bool out_of_memory(struct oom_control *oc)
>  {
>  	unsigned long freed = 0;
> +	bool delay_reap = false;
>  
>  	if (oom_killer_disabled)
>  		return false;
> @@ -1128,9 +1149,9 @@ bool out_of_memory(struct oom_control *oc)
>  	 * select it.  The goal is to allow it to allocate so that it may
>  	 * quickly exit and free its memory.
>  	 */
> -	if (task_will_free_mem(current)) {
> +	if (task_will_free_mem(current, &delay_reap)) {
>  		mark_oom_victim(current);
> -		queue_oom_reaper(current);
> +		queue_oom_reaper(current, delay_reap);
>  		return true;
>  	}
>  
> @@ -1231,7 +1252,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
>  	mm = p->mm;
>  	mmgrab(mm);
>  
> -	if (task_will_free_mem(p))
> +	if (task_will_free_mem(p, NULL))
>  		reap = true;
>  	else {
>  		/* Error only if the work has not been done already */
> -- 
> 2.17.1

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-08-04  6:04 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-04  3:03 [PATCH v3 1/2] futex: Add check_robust_futex to verify process usage of robust_futex zhongjinji
2025-08-04  3:03 ` [PATCH v3 2/2] mm/oom_kill: Only delay OOM reaper for processes using robust futex zhongjinji
2025-08-04  6:04   ` Michal Hocko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).