public inbox for linux-s390@vger.kernel.org
 help / color / mirror / Atom feed
From: Minchan Kim <minchan@kernel.org>
To: akpm@linux-foundation.org
Cc: hca@linux.ibm.com, linux-s390@vger.kernel.org, david@kernel.org,
	mhocko@suse.com, brauner@kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, surenb@google.com,
	timmurray@google.com, Minchan Kim <minchan@google.com>,
	Minchan Kim <minchan@kernel.org>
Subject: [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag
Date: Tue, 21 Apr 2026 16:02:39 -0700	[thread overview]
Message-ID: <20260421230239.172582-4-minchan@kernel.org> (raw)
In-Reply-To: <20260421230239.172582-1-minchan@kernel.org>

Currently, process_mrelease() requires userspace to send SIGKILL to the
victim prior to the call. Splitting the kill and the reap into two
separate steps introduces a scheduling race window where the victim task
may receive the signal and enter the exit path before the reaper can
invoke process_mrelease().

When the victim enters the exit path (do_exit -> exit_mm), it clears its
task->mm immediately. This causes process_mrelease() to fail with -ESRCH,
leaving the actual address space teardown (exit_mmap) deferred until the
mm's mm_users count drops to zero. On Android, transient mm_users
references held by other contexts (e.g., async I/O, reading
/proc/<pid>/cmdline, or various other remote VM accesses) frequently delay
this teardown indefinitely, defeating the purpose of expedited
reclamation.

This delay keeps memory pressure high, forcing the system to unnecessarily
kill additional innocent background apps before the memory from the first
victim is recovered.

Introduce the PROCESS_MRELEASE_REAP_KILL UAPI flag to support an
integrated auto-kill mode. When the flag is specified, process_mrelease()
itself sends SIGKILL to the target process before reaping its memory.

To close the race deterministically, grab a reference on the mm via
mmget() and set the MMF_UNSTABLE flag *before* sending the SIGKILL.
Unlike mmgrab(), which only pins the mm_struct itself (mm_count), mmget()
keeps mm_users > 0, preventing the victim from calling exit_mmap() in its
own exit path. This ensures that the memory is reclaimed synchronously and
deterministically by the reaper in the context of process_mrelease(),
avoiding delays caused by non-deterministic scheduling of the victim task.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/uapi/linux/mman.h |  4 +++
 mm/oom_kill.c             | 56 +++++++++++++++++++++++++++------------
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index e89d00528f2f..4266976b45ad 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -56,4 +56,8 @@ struct cachestat {
 	__u64 nr_recently_evicted;
 };
 
+/* Flags for process_mrelease */
+#define PROCESS_MRELEASE_REAP_KILL	(1 << 0)
+#define PROCESS_MRELEASE_VALID_FLAGS	(PROCESS_MRELEASE_REAP_KILL)
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..730ba0d19b53 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -20,6 +20,7 @@
 
 #include <linux/oom.h>
 #include <linux/mm.h>
+#include <uapi/linux/mman.h>
 #include <linux/err.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
@@ -850,7 +851,7 @@ bool oom_killer_disable(signed long timeout)
 	return true;
 }
 
-static inline bool __task_will_free_mem(struct task_struct *task)
+static inline bool __task_will_free_mem(struct task_struct *task, bool ignore_exit)
 {
 	struct signal_struct *sig = task->signal;
 
@@ -862,6 +863,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
 	if (sig->core_state)
 		return false;
 
+	if (ignore_exit)
+		return true;
+
 	if (sig->flags & SIGNAL_GROUP_EXIT)
 		return true;
 
@@ -878,7 +882,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
  * Caller has to make sure that task->mm is stable (hold task_lock or
  * it operates on the current).
  */
-static bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task, bool ignore_exit)
 {
 	struct mm_struct *mm = task->mm;
 	struct task_struct *p;
@@ -892,7 +896,7 @@ static bool task_will_free_mem(struct task_struct *task)
 	if (!mm)
 		return false;
 
-	if (!__task_will_free_mem(task))
+	if (!__task_will_free_mem(task, ignore_exit))
 		return false;
 
 	/*
@@ -916,7 +920,7 @@ static bool task_will_free_mem(struct task_struct *task)
 			continue;
 		if (same_thread_group(task, p))
 			continue;
-		ret = __task_will_free_mem(p);
+		ret = __task_will_free_mem(p, false);
 		if (!ret)
 			break;
 	}
@@ -1034,7 +1038,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 * so it can die quickly
 	 */
 	task_lock(victim);
-	if (task_will_free_mem(victim)) {
+	if (task_will_free_mem(victim, false)) {
 		mark_oom_victim(victim);
 		queue_oom_reaper(victim);
 		task_unlock(victim);
@@ -1135,7 +1139,7 @@ bool out_of_memory(struct oom_control *oc)
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (task_will_free_mem(current)) {
+	if (task_will_free_mem(current, false)) {
 		mark_oom_victim(current);
 		queue_oom_reaper(current);
 		return true;
@@ -1217,8 +1221,9 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	unsigned int f_flags;
 	bool reap = false;
 	long ret = 0;
+	bool reap_kill;
 
-	if (flags)
+	if (flags & ~PROCESS_MRELEASE_VALID_FLAGS)
 		return -EINVAL;
 
 	task = pidfd_get_task(pidfd, &f_flags);
@@ -1236,19 +1241,33 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	}
 
 	mm = p->mm;
-	mmgrab(mm);
 
-	if (task_will_free_mem(p))
-		reap = true;
-	else {
-		/* Error only if the work has not been done already */
-		if (!mm_flags_test(MMF_OOM_SKIP, mm))
+	reap_kill = !!(flags & PROCESS_MRELEASE_REAP_KILL);
+	reap = task_will_free_mem(p, reap_kill);
+	if (!reap) {
+		if (reap_kill || !mm_flags_test(MMF_OOM_SKIP, mm))
 			ret = -EINVAL;
+
+		task_unlock(p);
+		goto put_task;
 	}
-	task_unlock(p);
 
-	if (!reap)
-		goto drop_mm;
+	if (reap_kill) {
+		/*
+		 * We use mmget() instead of mmgrab() to keep mm_users > 0,
+		 * preventing the victim from calling exit_mmap() in its
+		 * own exit path. This ensures that the memory is reclaimed
+		 * synchronously and deterministically by the reaper.
+		 */
+		mmget(mm);
+		task_unlock(p);
+		ret = kill_pid(task_tgid(task), SIGKILL, 0);
+		if (ret)
+			goto drop_mm;
+	} else {
+		mmgrab(mm);
+		task_unlock(p);
+	}
 
 	if (mmap_read_lock_killable(mm)) {
 		ret = -EINTR;
@@ -1263,7 +1282,10 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	mmap_read_unlock(mm);
 
 drop_mm:
-	mmdrop(mm);
+	if (reap_kill)
+		mmput(mm);
+	else
+		mmdrop(mm);
 put_task:
 	put_task_struct(task);
 	return ret;
-- 
2.54.0.rc1.555.g9c883467ad-goog


      parent reply	other threads:[~2026-04-21 23:02 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21 23:02 [PATCH v1 0/3] mm: process_mrelease: expedite clean file folio reclaim and add auto-kill Minchan Kim
2026-04-21 23:02 ` [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather Minchan Kim
2026-04-21 23:02 ` [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios Minchan Kim
2026-04-22  7:22   ` Baolin Wang
2026-04-21 23:02 ` Minchan Kim [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260421230239.172582-4-minchan@kernel.org \
    --to=minchan@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=david@kernel.org \
    --cc=hca@linux.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mhocko@suse.com \
    --cc=minchan@google.com \
    --cc=surenb@google.com \
    --cc=timmurray@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox