* [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-21 23:02 [PATCH v1 0/3] mm: process_mrelease: expedite clean file folio reclaim and add auto-kill Minchan Kim
@ 2026-04-21 23:02 ` Minchan Kim
2026-04-24 7:56 ` David Hildenbrand (Arm)
2026-04-24 19:33 ` Matthew Wilcox
2026-04-21 23:02 ` [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios Minchan Kim
2026-04-21 23:02 ` [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag Minchan Kim
2 siblings, 2 replies; 21+ messages in thread
From: Minchan Kim @ 2026-04-21 23:02 UTC (permalink / raw)
To: akpm
Cc: hca, linux-s390, david, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim, Minchan Kim
Currently, process_mrelease() unmaps pages, but file-backed pages are
not evicted and stay in the pagecache, relying on standard memory reclaim
(kswapd or direct reclaim) to eventually free them. This delays the
immediate recovery of system memory in Android's LMKD scenarios, leading
to redundant background app kills.
This patch implements an expedited eviction mechanism for clean pagecache
folios in the mmu_gather code, similar to how swapcache folios are handled:
folios are dropped from the pagecache (i.e., evicted) when they are
completely unmapped during reaping.
Within this single unified loop, anonymous pages are released via
free_swap_cache(), and file-backed folios are symmetrically released via
free_file_cache().
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
arch/s390/include/asm/tlb.h | 2 +-
include/linux/swap.h | 5 ++---
mm/mmu_gather.c | 7 ++++---
mm/swap.c | 42 +++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 26 -----------------------
5 files changed, 49 insertions(+), 33 deletions(-)
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 619fd41e710e..2736dbb571a8 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -62,7 +62,7 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
VM_WARN_ON_ONCE(delay_rmap);
VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
- free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
+ free_pages_and_caches(tlb->mm, encoded_pages, ARRAY_SIZE(encoded_pages));
return false;
}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62fc7499b408..bdb784966343 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -414,7 +414,9 @@ extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#endif
+struct mm_struct;
void check_move_unevictable_folios(struct folio_batch *fbatch);
+void free_pages_and_caches(struct mm_struct *mm, struct encoded_page **pages, int nr);
extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);
@@ -433,7 +435,6 @@ static inline unsigned long total_swapcache_pages(void)
void free_swap_cache(struct folio *folio);
void free_folio_and_swap_cache(struct folio *folio);
-void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
@@ -510,8 +511,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
do { (val)->freeswap = (val)->totalswap = 0; } while (0)
#define free_folio_and_swap_cache(folio) \
folio_put(folio)
-#define free_pages_and_swap_cache(pages, nr) \
- release_pages((pages), (nr));
static inline void free_swap_cache(struct folio *folio)
{
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index fe5b6a031717..3c6c315d3c48 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -100,7 +100,8 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
*/
#define MAX_NR_FOLIOS_PER_FREE 512
-static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
+static void __tlb_batch_free_encoded_pages(struct mm_struct *mm,
+ struct mmu_gather_batch *batch)
{
struct encoded_page **pages = batch->encoded_pages;
unsigned int nr, nr_pages;
@@ -135,7 +136,7 @@ static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
}
}
- free_pages_and_swap_cache(pages, nr);
+ free_pages_and_caches(mm, pages, nr);
pages += nr;
batch->nr -= nr;
@@ -148,7 +149,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
- __tlb_batch_free_encoded_pages(batch);
+ __tlb_batch_free_encoded_pages(tlb->mm, batch);
tlb->active = &tlb->local;
}
diff --git a/mm/swap.c b/mm/swap.c
index bb19ccbece46..e44bc8cefceb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
}
EXPORT_SYMBOL(release_pages);
+static inline void free_file_cache(struct folio *folio)
+{
+ if (folio_trylock(folio)) {
+ mapping_evict_folio(folio_mapping(folio), folio);
+ folio_unlock(folio);
+ }
+}
+
+/*
+ * Passed an array of pages, drop them all from swapcache and then release
+ * them. They are removed from the LRU and freed if this is their last use.
+ *
+ * If @try_evict_file_folios is true, this function will proactively evict clean
+ * file-backed folios if they are no longer mapped.
+ */
+void free_pages_and_caches(struct mm_struct *mm, struct encoded_page **pages, int nr)
+{
+ bool try_evict_file_folios = mm_flags_test(MMF_UNSTABLE, mm);
+ struct folio_batch folios;
+ unsigned int refs[PAGEVEC_SIZE];
+
+ folio_batch_init(&folios);
+ for (int i = 0; i < nr; i++) {
+ struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
+
+ if (folio_test_anon(folio))
+ free_swap_cache(folio);
+ else if (unlikely(try_evict_file_folios))
+ free_file_cache(folio);
+
+ refs[folios.nr] = 1;
+ if (unlikely(encoded_page_flags(pages[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ refs[folios.nr] = encoded_nr_pages(pages[++i]);
+
+ if (folio_batch_add(&folios, folio) == 0)
+ folios_put_refs(&folios, refs);
+ }
+ if (folios.nr)
+ folios_put_refs(&folios, refs);
+}
+
/*
* The folios which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d0eef7470be..7576bf36d920 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -400,32 +400,6 @@ void free_folio_and_swap_cache(struct folio *folio)
folio_put(folio);
}
-/*
- * Passed an array of pages, drop them all from swapcache and then release
- * them. They are removed from the LRU and freed if this is their last use.
- */
-void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
-{
- struct folio_batch folios;
- unsigned int refs[PAGEVEC_SIZE];
-
- folio_batch_init(&folios);
- for (int i = 0; i < nr; i++) {
- struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
-
- free_swap_cache(folio);
- refs[folios.nr] = 1;
- if (unlikely(encoded_page_flags(pages[i]) &
- ENCODED_PAGE_BIT_NR_PAGES_NEXT))
- refs[folios.nr] = encoded_nr_pages(pages[++i]);
-
- if (folio_batch_add(&folios, folio) == 0)
- folios_put_refs(&folios, refs);
- }
- if (folios.nr)
- folios_put_refs(&folios, refs);
-}
-
static inline bool swap_use_vma_readahead(void)
{
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
--
2.54.0.rc1.555.g9c883467ad-goog
* Re: [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-21 23:02 ` [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather Minchan Kim
@ 2026-04-24 7:56 ` David Hildenbrand (Arm)
2026-04-24 21:24 ` Minchan Kim
2026-04-24 19:33 ` Matthew Wilcox
1 sibling, 1 reply; 21+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-24 7:56 UTC (permalink / raw)
To: Minchan Kim, akpm
Cc: hca, linux-s390, mhocko, brauner, linux-mm, linux-kernel, surenb,
timmurray, Minchan Kim
On 4/22/26 01:02, Minchan Kim wrote:
Can we make the subject easier to understand?
"mm: process_mrelease: evict clean file folios when reaping a process"
> Currently, process_mrelease() unmaps pages but file-backed pages are
> not evicted and stay in the pagecache, relying on standard memory reclaim
> (kswapd or direct reclaim) to eventually free them. This delays the
> immediate recovery of system memory under Android's LMKD scenarios,
> leading to redundant background apps kills.
>
> This patch implements an expedited eviction mechanism for clean pagecache
> folios in the mmu_gather code, similar to how swapcache folios are handled.
> It drops them from the pagecache (i.e., evicting them) if they are completely
> unmapped during reaping.
>
> Within this single unified loop, anonymous pages are released via
> free_swap_cache(), and file-backed folios are symmetrically released via
> free_file_cache().
>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
> arch/s390/include/asm/tlb.h | 2 +-
> include/linux/swap.h | 5 ++---
> mm/mmu_gather.c | 7 ++++---
> mm/swap.c | 42 +++++++++++++++++++++++++++++++++++++
> mm/swap_state.c | 26 -----------------------
> 5 files changed, 49 insertions(+), 33 deletions(-)
>
> diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
> index 619fd41e710e..2736dbb571a8 100644
> --- a/arch/s390/include/asm/tlb.h
> +++ b/arch/s390/include/asm/tlb.h
> @@ -62,7 +62,7 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
> VM_WARN_ON_ONCE(delay_rmap);
> VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
>
> - free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
> + free_pages_and_caches(tlb->mm, encoded_pages, ARRAY_SIZE(encoded_pages));
> return false;
> }
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 62fc7499b408..bdb784966343 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -414,7 +414,9 @@ extern int sysctl_min_unmapped_ratio;
> extern int sysctl_min_slab_ratio;
> #endif
>
> +struct mm_struct;
> void check_move_unevictable_folios(struct folio_batch *fbatch);
> +void free_pages_and_caches(struct mm_struct *mm, struct encoded_page **pages, int nr);
>
> extern void __meminit kswapd_run(int nid);
> extern void __meminit kswapd_stop(int nid);
> @@ -433,7 +435,6 @@ static inline unsigned long total_swapcache_pages(void)
>
> void free_swap_cache(struct folio *folio);
> void free_folio_and_swap_cache(struct folio *folio);
> -void free_pages_and_swap_cache(struct encoded_page **, int);
> /* linux/mm/swapfile.c */
> extern atomic_long_t nr_swap_pages;
> extern long total_swap_pages;
> @@ -510,8 +511,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
> do { (val)->freeswap = (val)->totalswap = 0; } while (0)
> #define free_folio_and_swap_cache(folio) \
> folio_put(folio)
> -#define free_pages_and_swap_cache(pages, nr) \
> - release_pages((pages), (nr));
>
> static inline void free_swap_cache(struct folio *folio)
> {
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index fe5b6a031717..3c6c315d3c48 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -100,7 +100,8 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
> */
> #define MAX_NR_FOLIOS_PER_FREE 512
>
> -static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
> +static void __tlb_batch_free_encoded_pages(struct mm_struct *mm,
> + struct mmu_gather_batch *batch)
> {
> struct encoded_page **pages = batch->encoded_pages;
> unsigned int nr, nr_pages;
> @@ -135,7 +136,7 @@ static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
> }
> }
>
> - free_pages_and_swap_cache(pages, nr);
> + free_pages_and_caches(mm, pages, nr);
> pages += nr;
> batch->nr -= nr;
>
> @@ -148,7 +149,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
> struct mmu_gather_batch *batch;
>
> for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
> - __tlb_batch_free_encoded_pages(batch);
> + __tlb_batch_free_encoded_pages(tlb->mm, batch);
> tlb->active = &tlb->local;
> }
>
> diff --git a/mm/swap.c b/mm/swap.c
> index bb19ccbece46..e44bc8cefceb 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
> }
> EXPORT_SYMBOL(release_pages);
>
> +static inline void free_file_cache(struct folio *folio)
> +{
> + if (folio_trylock(folio)) {
> + mapping_evict_folio(folio_mapping(folio), folio);
> + folio_unlock(folio);
> + }
> +}
> +
> +/*
> + * Passed an array of pages, drop them all from swapcache and then release
> + * them. They are removed from the LRU and freed if this is their last use.
> + *
> + * If @try_evict_file_folios is true, this function will proactively evict clean
> + * file-backed folios if they are no longer mapped.
There is no such parameter.
But I do wonder if such a parameter would be better than passing in the MM here.
Also, is there a way to avoid moving the function?
--
Cheers,
David
* Re: [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-24 7:56 ` David Hildenbrand (Arm)
@ 2026-04-24 21:24 ` Minchan Kim
2026-04-27 9:29 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 21+ messages in thread
From: Minchan Kim @ 2026-04-24 21:24 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: akpm, hca, linux-s390, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray
On Fri, Apr 24, 2026 at 09:56:01AM +0200, David Hildenbrand (Arm) wrote:
> On 4/22/26 01:02, Minchan Kim wrote:
>
> Can we make the subject easier to understand?
>
> "mm: process_mrelease: evict clean file folios when reaping a process"
>
> > Currently, process_mrelease() unmaps pages but file-backed pages are
> > not evicted and stay in the pagecache, relying on standard memory reclaim
> > (kswapd or direct reclaim) to eventually free them. This delays the
> > immediate recovery of system memory under Android's LMKD scenarios,
> > leading to redundant background apps kills.
> >
> > This patch implements an expedited eviction mechanism for clean pagecache
> > folios in the mmu_gather code, similar to how swapcache folios are handled.
> > It drops them from the pagecache (i.e., evicting them) if they are completely
> > unmapped during reaping.
> >
> > Within this single unified loop, anonymous pages are released via
> > free_swap_cache(), and file-backed folios are symmetrically released via
> > free_file_cache().
> >
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> > arch/s390/include/asm/tlb.h | 2 +-
> > include/linux/swap.h | 5 ++---
> > mm/mmu_gather.c | 7 ++++---
> > mm/swap.c | 42 +++++++++++++++++++++++++++++++++++++
> > mm/swap_state.c | 26 -----------------------
> > 5 files changed, 49 insertions(+), 33 deletions(-)
> >
> > diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
> > index 619fd41e710e..2736dbb571a8 100644
> > --- a/arch/s390/include/asm/tlb.h
> > +++ b/arch/s390/include/asm/tlb.h
> > @@ -62,7 +62,7 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
> > VM_WARN_ON_ONCE(delay_rmap);
> > VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
> >
> > - free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
> > + free_pages_and_caches(tlb->mm, encoded_pages, ARRAY_SIZE(encoded_pages));
> > return false;
> > }
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 62fc7499b408..bdb784966343 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -414,7 +414,9 @@ extern int sysctl_min_unmapped_ratio;
> > extern int sysctl_min_slab_ratio;
> > #endif
> >
> > +struct mm_struct;
> > void check_move_unevictable_folios(struct folio_batch *fbatch);
> > +void free_pages_and_caches(struct mm_struct *mm, struct encoded_page **pages, int nr);
> >
> > extern void __meminit kswapd_run(int nid);
> > extern void __meminit kswapd_stop(int nid);
> > @@ -433,7 +435,6 @@ static inline unsigned long total_swapcache_pages(void)
> >
> > void free_swap_cache(struct folio *folio);
> > void free_folio_and_swap_cache(struct folio *folio);
> > -void free_pages_and_swap_cache(struct encoded_page **, int);
> > /* linux/mm/swapfile.c */
> > extern atomic_long_t nr_swap_pages;
> > extern long total_swap_pages;
> > @@ -510,8 +511,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
> > do { (val)->freeswap = (val)->totalswap = 0; } while (0)
> > #define free_folio_and_swap_cache(folio) \
> > folio_put(folio)
> > -#define free_pages_and_swap_cache(pages, nr) \
> > - release_pages((pages), (nr));
> >
> > static inline void free_swap_cache(struct folio *folio)
> > {
> > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> > index fe5b6a031717..3c6c315d3c48 100644
> > --- a/mm/mmu_gather.c
> > +++ b/mm/mmu_gather.c
> > @@ -100,7 +100,8 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
> > */
> > #define MAX_NR_FOLIOS_PER_FREE 512
> >
> > -static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
> > +static void __tlb_batch_free_encoded_pages(struct mm_struct *mm,
> > + struct mmu_gather_batch *batch)
> > {
> > struct encoded_page **pages = batch->encoded_pages;
> > unsigned int nr, nr_pages;
> > @@ -135,7 +136,7 @@ static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
> > }
> > }
> >
> > - free_pages_and_swap_cache(pages, nr);
> > + free_pages_and_caches(mm, pages, nr);
> > pages += nr;
> > batch->nr -= nr;
> >
> > @@ -148,7 +149,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
> > struct mmu_gather_batch *batch;
> >
> > for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
> > - __tlb_batch_free_encoded_pages(batch);
> > + __tlb_batch_free_encoded_pages(tlb->mm, batch);
> > tlb->active = &tlb->local;
> > }
> >
> > diff --git a/mm/swap.c b/mm/swap.c
> > index bb19ccbece46..e44bc8cefceb 100644
> > --- a/mm/swap.c
> > +++ b/mm/swap.c
> > @@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
> > }
> > EXPORT_SYMBOL(release_pages);
> >
> > +static inline void free_file_cache(struct folio *folio)
> > +{
> > + if (folio_trylock(folio)) {
> > + mapping_evict_folio(folio_mapping(folio), folio);
> > + folio_unlock(folio);
> > + }
> > +}
> > +
> > +/*
> > + * Passed an array of pages, drop them all from swapcache and then release
> > + * them. They are removed from the LRU and freed if this is their last use.
> > + *
> > + * If @try_evict_file_folios is true, this function will proactively evict clean
> > + * file-backed folios if they are no longer mapped.
>
> There is no such parameter.
>
> But I do wonder if such a parameter would be better than passing in the MM here.
Makes sense.
>
> Also, is there a way to avoid moving the function?
I guess you are talking about "free_pages_and_swap_cache".
The problem is mm/swap_state.c is conditionally compiled only when CONFIG_SWAP
is enabled.
Since the expedited clean file cache eviction should be available
even on !CONFIG_SWAP, we need this function to be compiled unconditionally.
That is why I moved it to swap.c, which is always compiled.
I am open to any suggestions.
* Re: [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-24 21:24 ` Minchan Kim
@ 2026-04-27 9:29 ` David Hildenbrand (Arm)
0 siblings, 0 replies; 21+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-27 9:29 UTC (permalink / raw)
To: Minchan Kim
Cc: akpm, hca, linux-s390, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray
On 4/24/26 23:24, Minchan Kim wrote:
> On Fri, Apr 24, 2026 at 09:56:01AM +0200, David Hildenbrand (Arm) wrote:
>> On 4/22/26 01:02, Minchan Kim wrote:
>>
>> Can we make the subject easier to understand?
>>
>> "mm: process_mrelease: evict clean file folios when reaping a process"
>>
>>> Currently, process_mrelease() unmaps pages but file-backed pages are
>>> not evicted and stay in the pagecache, relying on standard memory reclaim
>>> (kswapd or direct reclaim) to eventually free them. This delays the
>>> immediate recovery of system memory under Android's LMKD scenarios,
>>> leading to redundant background apps kills.
>>>
>>> This patch implements an expedited eviction mechanism for clean pagecache
>>> folios in the mmu_gather code, similar to how swapcache folios are handled.
>>> It drops them from the pagecache (i.e., evicting them) if they are completely
>>> unmapped during reaping.
>>>
>>> Within this single unified loop, anonymous pages are released via
>>> free_swap_cache(), and file-backed folios are symmetrically released via
>>> free_file_cache().
>>>
>>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>>> ---
>>> arch/s390/include/asm/tlb.h | 2 +-
>>> include/linux/swap.h | 5 ++---
>>> mm/mmu_gather.c | 7 ++++---
>>> mm/swap.c | 42 +++++++++++++++++++++++++++++++++++++
>>> mm/swap_state.c | 26 -----------------------
>>> 5 files changed, 49 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
>>> index 619fd41e710e..2736dbb571a8 100644
>>> --- a/arch/s390/include/asm/tlb.h
>>> +++ b/arch/s390/include/asm/tlb.h
>>> @@ -62,7 +62,7 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
>>> VM_WARN_ON_ONCE(delay_rmap);
>>> VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
>>>
>>> - free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
>>> + free_pages_and_caches(tlb->mm, encoded_pages, ARRAY_SIZE(encoded_pages));
>>> return false;
>>> }
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 62fc7499b408..bdb784966343 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -414,7 +414,9 @@ extern int sysctl_min_unmapped_ratio;
>>> extern int sysctl_min_slab_ratio;
>>> #endif
>>>
>>> +struct mm_struct;
>>> void check_move_unevictable_folios(struct folio_batch *fbatch);
>>> +void free_pages_and_caches(struct mm_struct *mm, struct encoded_page **pages, int nr);
>>>
>>> extern void __meminit kswapd_run(int nid);
>>> extern void __meminit kswapd_stop(int nid);
>>> @@ -433,7 +435,6 @@ static inline unsigned long total_swapcache_pages(void)
>>>
>>> void free_swap_cache(struct folio *folio);
>>> void free_folio_and_swap_cache(struct folio *folio);
>>> -void free_pages_and_swap_cache(struct encoded_page **, int);
>>> /* linux/mm/swapfile.c */
>>> extern atomic_long_t nr_swap_pages;
>>> extern long total_swap_pages;
>>> @@ -510,8 +511,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
>>> do { (val)->freeswap = (val)->totalswap = 0; } while (0)
>>> #define free_folio_and_swap_cache(folio) \
>>> folio_put(folio)
>>> -#define free_pages_and_swap_cache(pages, nr) \
>>> - release_pages((pages), (nr));
>>>
>>> static inline void free_swap_cache(struct folio *folio)
>>> {
>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
>>> index fe5b6a031717..3c6c315d3c48 100644
>>> --- a/mm/mmu_gather.c
>>> +++ b/mm/mmu_gather.c
>>> @@ -100,7 +100,8 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
>>> */
>>> #define MAX_NR_FOLIOS_PER_FREE 512
>>>
>>> -static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
>>> +static void __tlb_batch_free_encoded_pages(struct mm_struct *mm,
>>> + struct mmu_gather_batch *batch)
>>> {
>>> struct encoded_page **pages = batch->encoded_pages;
>>> unsigned int nr, nr_pages;
>>> @@ -135,7 +136,7 @@ static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
>>> }
>>> }
>>>
>>> - free_pages_and_swap_cache(pages, nr);
>>> + free_pages_and_caches(mm, pages, nr);
>>> pages += nr;
>>> batch->nr -= nr;
>>>
>>> @@ -148,7 +149,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
>>> struct mmu_gather_batch *batch;
>>>
>>> for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
>>> - __tlb_batch_free_encoded_pages(batch);
>>> + __tlb_batch_free_encoded_pages(tlb->mm, batch);
>>> tlb->active = &tlb->local;
>>> }
>>>
>>> diff --git a/mm/swap.c b/mm/swap.c
>>> index bb19ccbece46..e44bc8cefceb 100644
>>> --- a/mm/swap.c
>>> +++ b/mm/swap.c
>>> @@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
>>> }
>>> EXPORT_SYMBOL(release_pages);
>>>
>>> +static inline void free_file_cache(struct folio *folio)
>>> +{
>>> + if (folio_trylock(folio)) {
>>> + mapping_evict_folio(folio_mapping(folio), folio);
>>> + folio_unlock(folio);
>>> + }
>>> +}
>>> +
>>> +/*
>>> + * Passed an array of pages, drop them all from swapcache and then release
>>> + * them. They are removed from the LRU and freed if this is their last use.
>>> + *
>>> + * If @try_evict_file_folios is true, this function will proactively evict clean
>>> + * file-backed folios if they are no longer mapped.
>>
>> There is no such parameter.
>>
>> But I do wonder if such a parameter would be better than passing in the MM here.
>
> Makes sense.
>
>>
>> Also, is there a way to avoid moving the function?
>
> I guess you are talking about "free_pages_and_swap_cache".
>
> The problem is mm/swap_state.c is conditionally compiled only when CONFIG_SWAP
> is enabled.
>
> Since the expedited clean file cache eviction should be available
> even on !CONFIG_SWAP, we need this function to be compiled unconditionally.
> That is why I moved it to swap.c, which is always compiled.
Ah, right. Please spell that out in the patch description!
--
Cheers,
David
* Re: [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-21 23:02 ` [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather Minchan Kim
2026-04-24 7:56 ` David Hildenbrand (Arm)
@ 2026-04-24 19:33 ` Matthew Wilcox
2026-04-24 21:56 ` Minchan Kim
1 sibling, 1 reply; 21+ messages in thread
From: Matthew Wilcox @ 2026-04-24 19:33 UTC (permalink / raw)
To: Minchan Kim
Cc: akpm, hca, linux-s390, david, mhocko, brauner, linux-mm,
linux-kernel, surenb, timmurray, Minchan Kim
On Tue, Apr 21, 2026 at 04:02:37PM -0700, Minchan Kim wrote:
> +++ b/mm/swap.c
> @@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
> }
> EXPORT_SYMBOL(release_pages);
>
> +static inline void free_file_cache(struct folio *folio)
> +{
> + if (folio_trylock(folio)) {
> + mapping_evict_folio(folio_mapping(folio), folio);
If we already know that the folio is for a file (and I think we do?)
then we can just use folio->mapping here. On the other hand, if it
could be KSM or something else weird, carry on.
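For context, the distinction is roughly the following (a from-memory
paraphrase of the folio_mapping() helper, for illustration only; the
exact upstream code may differ slightly):
/*
 * Rough paraphrase of folio_mapping() -- illustration only, not the
 * verbatim upstream code.  A raw folio->mapping dereference skips
 * all of these checks, which is why it is only safe once the caller
 * knows the folio is a plain pagecache folio.
 */
static struct address_space *folio_mapping_sketch(struct folio *folio)
{
	struct address_space *mapping;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	/* anon, KSM and movable folios tag the low bits of ->mapping */
	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
		return NULL;

	return mapping;
}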
* Re: [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather
2026-04-24 19:33 ` Matthew Wilcox
@ 2026-04-24 21:56 ` Minchan Kim
0 siblings, 0 replies; 21+ messages in thread
From: Minchan Kim @ 2026-04-24 21:56 UTC (permalink / raw)
To: Matthew Wilcox
Cc: akpm, hca, linux-s390, david, mhocko, brauner, linux-mm,
linux-kernel, surenb, timmurray
On Fri, Apr 24, 2026 at 08:33:03PM +0100, Matthew Wilcox wrote:
> On Tue, Apr 21, 2026 at 04:02:37PM -0700, Minchan Kim wrote:
> > +++ b/mm/swap.c
> > @@ -1043,6 +1043,48 @@ void release_pages(release_pages_arg arg, int nr)
> > }
> > EXPORT_SYMBOL(release_pages);
> >
> > +static inline void free_file_cache(struct folio *folio)
> > +{
> > + if (folio_trylock(folio)) {
> > + mapping_evict_folio(folio_mapping(folio), folio);
>
> If we already know that the folio is for a file (and I think we do?)
> then we can just use folio->mapping here. On the other hand, if it
> could be KSM or something else weird, carry on.
Thanks for the review. It made me think about the shmem corner cases.
Since we already check folio_test_anon(folio) before calling this path,
we know we are dealing with non-anonymous folios.
My specific concern was shmem folios, which are not anonymous but can
be in the swap cache. While mapping_evict_folio() might technically work
for them at this point (since remove_mapping handles it but I might miss),
it feels unintentional and fragile because mapping_evict_folio() is
primarily designed for page cache eviction, not swap cache.
To make this robust and safely adopt your suggestion of using folio->mapping
directly, I think we should handle swap cache folios explicitly
in the main loop like this:
void free_pages_and_caches(struct encoded_page **pages, int nr,
			   bool try_evict_file_folios)
{
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		if (folio_test_anon(folio) || folio_test_swapcache(folio))
			free_swap_cache(folio);
		else if (unlikely(try_evict_file_folios))
			free_file_cache(folio);
		...
	}
}
And then we can use folio->mapping directly in the helper:
static inline void free_file_cache(struct folio *folio)
{
	if (folio_trylock(folio)) {
		mapping_evict_folio(folio->mapping, folio);
		folio_unlock(folio);
	}
}
This way, we are guaranteed that anything reaching free_file_cache() is a
non-swapcache file folio, making the direct use of folio->mapping safe.
Please let me know if I am missing something here.
* [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-21 23:02 [PATCH v1 0/3] mm: process_mrelease: expedite clean file folio reclaim and add auto-kill Minchan Kim
2026-04-21 23:02 ` [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather Minchan Kim
@ 2026-04-21 23:02 ` Minchan Kim
2026-04-22 7:22 ` Baolin Wang
2026-04-24 7:51 ` Michal Hocko
2026-04-21 23:02 ` [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag Minchan Kim
2 siblings, 2 replies; 21+ messages in thread
From: Minchan Kim @ 2026-04-21 23:02 UTC (permalink / raw)
To: akpm
Cc: hca, linux-s390, david, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim, Minchan Kim
For process_mrelease reclaim, skip LRU handling for exclusive
file-backed folios: they will be freed soon, so it is pointless to
move them around in the LRU.
This avoids costly LRU movement, which accounts for a significant
portion of the time spent in unmap_page_range().
- 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
exit_mm
__mmput
exit_mmap
unmap_vmas
- unmap_page_range
- 55.75% folio_mark_accessed
+ 48.79% __folio_batch_add_and_move
4.23% workingset_activation
+ 12.94% folio_remove_rmap_ptes
+ 9.86% page_table_check_clear
+ 3.34% tlb_flush_mmu
1.06% __page_table_check_pte_clear
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
mm/memory.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..fcb57630bb8d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1640,6 +1640,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
bool delay_rmap = false;
if (!folio_test_anon(folio)) {
+ bool skip_mark_accessed;
+
ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
if (pte_dirty(ptent)) {
folio_mark_dirty(folio);
@@ -1648,7 +1650,16 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
*force_flush = true;
}
}
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
+
+ /*
+ * For process_mrelease reclaim, skip LRU handling for exclusive
+ * file-backed folios: they will be freed soon, so it is pointless
+ * to move them around in the LRU.
+ */
+ skip_mark_accessed = mm_flags_test(MMF_UNSTABLE, mm) &&
+ !folio_maybe_mapped_shared(folio);
+ if (likely(!skip_mark_accessed) && pte_young(ptent) &&
+ likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
rss[mm_counter(folio)] -= nr;
} else {
--
2.54.0.rc1.555.g9c883467ad-goog
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-21 23:02 ` [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios Minchan Kim
@ 2026-04-22 7:22 ` Baolin Wang
2026-04-23 23:38 ` Minchan Kim
2026-04-24 7:51 ` Michal Hocko
1 sibling, 1 reply; 21+ messages in thread
From: Baolin Wang @ 2026-04-22 7:22 UTC (permalink / raw)
To: Minchan Kim, akpm
Cc: hca, linux-s390, david, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim
On 4/22/26 7:02 AM, Minchan Kim wrote:
> For the process_mrelease reclaim, skip LRU handling for exclusive
> file-backed folios since they will be freed soon so pointless
> to move around in the LRU.
>
> This avoids costly LRU movement which accounts for a significant portion
> of the time during unmap_page_range.
>
> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> exit_mm
> __mmput
> exit_mmap
> unmap_vmas
> - unmap_page_range
> - 55.75% folio_mark_accessed
> + 48.79% __folio_batch_add_and_move
> 4.23% workingset_activation
> + 12.94% folio_remove_rmap_ptes
> + 9.86% page_table_check_clear
> + 3.34% tlb_flush_mmu
> 1.06% __page_table_check_pte_clear
>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
> mm/memory.c | 13 ++++++++++++-
> 1 file changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 2f815a34d924..fcb57630bb8d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1640,6 +1640,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> bool delay_rmap = false;
>
> if (!folio_test_anon(folio)) {
> + bool skip_mark_accessed;
> +
> ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> if (pte_dirty(ptent)) {
> folio_mark_dirty(folio);
> @@ -1648,7 +1650,16 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> *force_flush = true;
> }
> }
> - if (pte_young(ptent) && likely(vma_has_recency(vma)))
> +
> + /*
> + * For the process_mrelease reclaim, skip LRU handling for exclusive
> + * file-backed folios since they will be freed soon so pointless
> + * to move around in the LRU.
> + */
> + skip_mark_accessed = mm_flags_test(MMF_UNSTABLE, mm) &&
> + !folio_maybe_mapped_shared(folio);
> + if (likely(!skip_mark_accessed) && pte_young(ptent) &&
> + likely(vma_has_recency(vma)))
> folio_mark_accessed(folio);
> rss[mm_counter(folio)] -= nr;
> } else {
Seems we should also skip marking exclusive pmd-sized folios as accessed
in zap_huge_pmd_folio().
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-22 7:22 ` Baolin Wang
@ 2026-04-23 23:38 ` Minchan Kim
0 siblings, 0 replies; 21+ messages in thread
From: Minchan Kim @ 2026-04-23 23:38 UTC (permalink / raw)
To: Baolin Wang
Cc: akpm, hca, linux-s390, david, mhocko, brauner, linux-mm,
linux-kernel, surenb, timmurray
On Wed, Apr 22, 2026 at 03:22:46PM +0800, Baolin Wang wrote:
>
>
> On 4/22/26 7:02 AM, Minchan Kim wrote:
> > For the process_mrelease reclaim, skip LRU handling for exclusive
> > file-backed folios since they will be freed soon so pointless
> > to move around in the LRU.
> >
> > This avoids costly LRU movement which accounts for a significant portion
> > of the time during unmap_page_range.
> >
> > - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> > exit_mm
> > __mmput
> > exit_mmap
> > unmap_vmas
> > - unmap_page_range
> > - 55.75% folio_mark_accessed
> > + 48.79% __folio_batch_add_and_move
> > 4.23% workingset_activation
> > + 12.94% folio_remove_rmap_ptes
> > + 9.86% page_table_check_clear
> > + 3.34% tlb_flush_mmu
> > 1.06% __page_table_check_pte_clear
> >
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> > mm/memory.c | 13 ++++++++++++-
> > 1 file changed, 12 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 2f815a34d924..fcb57630bb8d 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1640,6 +1640,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> > bool delay_rmap = false;
> > if (!folio_test_anon(folio)) {
> > + bool skip_mark_accessed;
> > +
> > ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> > if (pte_dirty(ptent)) {
> > folio_mark_dirty(folio);
> > @@ -1648,7 +1650,16 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> > *force_flush = true;
> > }
> > }
> > - if (pte_young(ptent) && likely(vma_has_recency(vma)))
> > +
> > + /*
> > + * For the process_mrelease reclaim, skip LRU handling for exclusive
> > + * file-backed folios since they will be freed soon so pointless
> > + * to move around in the LRU.
> > + */
> > + skip_mark_accessed = mm_flags_test(MMF_UNSTABLE, mm) &&
> > + !folio_maybe_mapped_shared(folio);
> > + if (likely(!skip_mark_accessed) && pte_young(ptent) &&
> > + likely(vma_has_recency(vma)))
> > folio_mark_accessed(folio);
> > rss[mm_counter(folio)] -= nr;
> > } else {
>
> Seems we should also skip marking exclusive pmd-sized folios as accessed in
> zap_huge_pmd_folio().
Good catch.
Will update in next revision.
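Presumably the pmd-level change will just mirror the pte-level one. A rough,
untested sketch (assuming zap_huge_pmd_folio() guards folio_mark_accessed()
with pmd_young() and vma_has_recency(), as the pte path does):
	/*
	 * Hypothetical sketch only, not the actual zap_huge_pmd_folio() body:
	 * mirror the pte-level check so that exclusively mapped file folios
	 * skip LRU activation while the mm is being reaped (MMF_UNSTABLE).
	 */
	bool skip_mark_accessed = mm_flags_test(MMF_UNSTABLE, mm) &&
				  !folio_maybe_mapped_shared(folio);

	if (!skip_mark_accessed && pmd_young(orig_pmd) &&
	    likely(vma_has_recency(vma)))
		folio_mark_accessed(folio);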
Thank you.
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-21 23:02 ` [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios Minchan Kim
2026-04-22 7:22 ` Baolin Wang
@ 2026-04-24 7:51 ` Michal Hocko
2026-04-24 7:57 ` David Hildenbrand (Arm)
2026-04-24 19:26 ` Minchan Kim
1 sibling, 2 replies; 21+ messages in thread
From: Michal Hocko @ 2026-04-24 7:51 UTC (permalink / raw)
To: Minchan Kim
Cc: akpm, hca, linux-s390, david, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim
On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> For the process_mrelease reclaim, skip LRU handling for exclusive
> file-backed folios since they will be freed soon so pointless
> to move around in the LRU.
>
> This avoids costly LRU movement which accounts for a significant portion
> of the time during unmap_page_range.
>
> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> exit_mm
> __mmput
> exit_mmap
> unmap_vmas
> - unmap_page_range
> - 55.75% folio_mark_accessed
> + 48.79% __folio_batch_add_and_move
> 4.23% workingset_activation
> + 12.94% folio_remove_rmap_ptes
> + 9.86% page_table_check_clear
> + 3.34% tlb_flush_mmu
> 1.06% __page_table_check_pte_clear
>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
As pointed out in the previous version of the patch, I really dislike
this being mrelease- or OOM-specific behavior. You do not explain why
this needs to be this way, except for the performance reasons. My main
question is still unanswered (and NAK before this is sorted out): why
can this not be applied in general to _any_ exiting task? As you argue,
the memory will just likely go away, so why bother?
> ---
> mm/memory.c | 13 ++++++++++++-
> 1 file changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 2f815a34d924..fcb57630bb8d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1640,6 +1640,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> bool delay_rmap = false;
>
> if (!folio_test_anon(folio)) {
> + bool skip_mark_accessed;
> +
> ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
> if (pte_dirty(ptent)) {
> folio_mark_dirty(folio);
> @@ -1648,7 +1650,16 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
> *force_flush = true;
> }
> }
> - if (pte_young(ptent) && likely(vma_has_recency(vma)))
> +
> + /*
> + * For the process_mrelease reclaim, skip LRU handling for exclusive
> + * file-backed folios since they will be freed soon so pointless
> + * to move around in the LRU.
> + */
> + skip_mark_accessed = mm_flags_test(MMF_UNSTABLE, mm) &&
> + !folio_maybe_mapped_shared(folio);
> + if (likely(!skip_mark_accessed) && pte_young(ptent) &&
> + likely(vma_has_recency(vma)))
> folio_mark_accessed(folio);
> rss[mm_counter(folio)] -= nr;
> } else {
> --
> 2.54.0.rc1.555.g9c883467ad-goog
>
--
Michal Hocko
SUSE Labs
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-24 7:51 ` Michal Hocko
@ 2026-04-24 7:57 ` David Hildenbrand (Arm)
2026-04-24 19:15 ` Minchan Kim
2026-04-24 19:26 ` Minchan Kim
1 sibling, 1 reply; 21+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-24 7:57 UTC (permalink / raw)
To: Michal Hocko, Minchan Kim
Cc: akpm, hca, linux-s390, brauner, linux-mm, linux-kernel, surenb,
timmurray, Minchan Kim
On 4/24/26 09:51, Michal Hocko wrote:
> On Tue 21-04-26 16:02:38, Minchan Kim wrote:
>> For the process_mrelease reclaim, skip LRU handling for exclusive
>> file-backed folios since they will be freed soon so pointless
>> to move around in the LRU.
>>
>> This avoids costly LRU movement which accounts for a significant portion
>> of the time during unmap_page_range.
>>
>> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
>> exit_mm
>> __mmput
>> exit_mmap
>> unmap_vmas
>> - unmap_page_range
>> - 55.75% folio_mark_accessed
>> + 48.79% __folio_batch_add_and_move
>> 4.23% workingset_activation
>> + 12.94% folio_remove_rmap_ptes
>> + 9.86% page_table_check_clear
>> + 3.34% tlb_flush_mmu
>> 1.06% __page_table_check_pte_clear
>>
>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>
> As pointed out in the previous version of the patch. I really dislike
> this to be mrelease or OOM specific. Behavior. You do not explain why
> this needs to be this way, except for the performance reasons. My main
> question is still unanswered (and NAK before this is sorted out). Why
> this cannot be applied in general for _any_ exiting task. As you argue
> the memory will just likely go away so why to bother?
I think there was a lengthy discussion involving Johannes from a previous series.
That should be linked here indeed.
--
Cheers,
David
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-24 7:57 ` David Hildenbrand (Arm)
@ 2026-04-24 19:15 ` Minchan Kim
2026-04-27 7:16 ` Michal Hocko
0 siblings, 1 reply; 21+ messages in thread
From: Minchan Kim @ 2026-04-24 19:15 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: Michal Hocko, akpm, hca, linux-s390, brauner, linux-mm,
linux-kernel, surenb, timmurray
On Fri, Apr 24, 2026 at 09:57:16AM +0200, David Hildenbrand (Arm) wrote:
> On 4/24/26 09:51, Michal Hocko wrote:
> > On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> >> For the process_mrelease reclaim, skip LRU handling for exclusive
> >> file-backed folios since they will be freed soon so pointless
> >> to move around in the LRU.
> >>
> >> This avoids costly LRU movement which accounts for a significant portion
> >> of the time during unmap_page_range.
> >>
> >> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> >> exit_mm
> >> __mmput
> >> exit_mmap
> >> unmap_vmas
> >> - unmap_page_range
> >> - 55.75% folio_mark_accessed
> >> + 48.79% __folio_batch_add_and_move
> >> 4.23% workingset_activation
> >> + 12.94% folio_remove_rmap_ptes
> >> + 9.86% page_table_check_clear
> >> + 3.34% tlb_flush_mmu
> >> 1.06% __page_table_check_pte_clear
> >>
> >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> >
> > As pointed out in the previous version of the patch. I really dislike
> > this to be mrelease or OOM specific. Behavior. You do not explain why
> > this needs to be this way, except for the performance reasons. My main
> > question is still unanswered (and NAK before this is sorted out). Why
> > this cannot be applied in general for _any_ exiting task. As you argue
> > the memory will just likely go away so why to bother?
>
> I think there was a lengthy discussion involving Johannes from a previous series.
>
> That should be linked here indeed.
How about this?
mm: process_mrelease: skip LRU movement for exclusive file folios
During process_mrelease() or OOM reaping, unmapping file-backed folios
spends a significant portion of CPU time in folio_mark_accessed() to
maintain accurate LRU state (~55% of unmap time as shown in the profile
below).
This patch skips LRU handling for exclusive file-backed folios during
such emergency memory reclaim.
One might ask why this optimization shouldn't be applied to any exiting
task in general. The reason is that for a normal, orderly exit or a
plain kill, it is worth paying the CPU cost to preserve the active state
of clean file folios in case they are reused soon. Preserving cache hits
is beneficial for overall system performance.
However, process_mrelease() and OOM reaping are emergency operations
triggered under extreme memory pressure. In these scenarios, the highest
priority is to recover memory as quickly as possible to avoid further
kills or system jank. Spending half of the unmap time on LRU maintenance
for pages belonging to a victim process is a bad trade-off. If speeding up
the victim's reclaim by avoiding LRU movement and evicting cache negatively
affects the workflow (due to immediate restart), it implies a sub-optimal
kill target selection by the userspace policy (e.g., LMKD), rather than
a problem with these expedited APIs.
Therefore, we choose to prioritize immediate CPU savings and faster
memory recovery over potential future cache hits for the specific victim's
files.
Profile showing the overhead of folio_mark_accessed during unmap:
- 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
exit_mm
__mmput
exit_mmap
unmap_vmas
- unmap_page_range
- 55.75% folio_mark_accessed
+ 48.79% __folio_batch_add_and_move
4.23% workingset_activation
+ 12.94% folio_remove_rmap_ptes
+ 9.86% page_table_check_clear
+ 3.34% tlb_flush_mmu
1.06% __page_table_check_pte_clear
Signed-off-by: Minchan Kim <minchan@kernel.org>
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-24 19:15 ` Minchan Kim
@ 2026-04-27 7:16 ` Michal Hocko
2026-04-27 16:48 ` Suren Baghdasaryan
0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2026-04-27 7:16 UTC (permalink / raw)
To: Minchan Kim
Cc: David Hildenbrand (Arm), akpm, hca, linux-s390, brauner, linux-mm,
linux-kernel, surenb, timmurray
On Fri 24-04-26 12:15:18, Minchan Kim wrote:
> On Fri, Apr 24, 2026 at 09:57:16AM +0200, David Hildenbrand (Arm) wrote:
> > On 4/24/26 09:51, Michal Hocko wrote:
> > > On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> > >> For the process_mrelease reclaim, skip LRU handling for exclusive
> > >> file-backed folios since they will be freed soon so pointless
> > >> to move around in the LRU.
> > >>
> > >> This avoids costly LRU movement which accounts for a significant portion
> > >> of the time during unmap_page_range.
> > >>
> > >> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> > >> exit_mm
> > >> __mmput
> > >> exit_mmap
> > >> unmap_vmas
> > >> - unmap_page_range
> > >> - 55.75% folio_mark_accessed
> > >> + 48.79% __folio_batch_add_and_move
> > >> 4.23% workingset_activation
> > >> + 12.94% folio_remove_rmap_ptes
> > >> + 9.86% page_table_check_clear
> > >> + 3.34% tlb_flush_mmu
> > >> 1.06% __page_table_check_pte_clear
> > >>
> > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > >
> > > As pointed out in the previous version of the patch. I really dislike
> > > this to be mrelease or OOM specific. Behavior. You do not explain why
> > > this needs to be this way, except for the performance reasons. My main
> > > question is still unanswered (and NAK before this is sorted out). Why
> > > this cannot be applied in general for _any_ exiting task. As you argue
> > > the memory will just likely go away so why to bother?
> >
> > I think there was a lengthy discussion involving Johannes from a previous series.
> >
> > That should be linked here indeed.
>
> How about this?
>
> mm: process_mrelease: skip LRU movement for exclusive file folios
>
> During process_mrelease() or OOM reaping, unmapping file-backed folios
> spends a significant portion of CPU time in folio_mark_accessed() to
> maintain accurate LRU state (~55% of unmap time as shown in the profile
> below).
>
> This patch skips LRU handling for exclusive file-backed folios during
> such emergency memory reclaim.
>
> One might ask why this optimization shouldn't be applied to any exiting
> task in general. The reason is that for a normal, orderly exit or just
> pure kill, it is worth paying the CPU cost to preserve the active state
> of clean file folios in case they are reused soon. Preserving cache hits
> is beneficial for overall system performance.
This is a statement rather than an explanation. Why is it worth paying
the cost? What is different here?
> However, process_mrelease() and OOM reaping are emergency operations
> triggered under extreme memory pressure. In these scenarios, the highest
> priority is to recover memory as quickly as possible to avoid further
> kills or system jank. Spending half of the unmap time on LRU maintenance
> for pages belonging to a victim process is a bad trade-off. If speeding up
> the victim's reclaim by avoiding LRU movement and evicting cache negatively
> affects the workflow (due to immediate restart), it implies a sub-optimal
> kill target selection by the userspace policy (e.g., LMKD), rather than
> a problem in this expedited APIs.
Your change effectively boils down to breaking aging for exclusively mapped
file pages when those pages should have been activated. All that because
the activation has some (batched) overhead. You argue that the overhead
is not a good trade-off for the OOM path because those pages are exclusive
to the process and will therefore go away after the task exits.
The same line of argument applies to tasks exiting normally too. Task
exit is not the hottest path, but it is certainly something noticeable,
especially for huge tasks.
All that being said, you really need to focus on why breaking the aging is
a worthwhile optimization. Keep in mind that while the page might be
exclusively mapped, it could still be actively consumed from the page
cache, and breaking the aging could lead to refaults.
--
Michal Hocko
SUSE Labs
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-27 7:16 ` Michal Hocko
@ 2026-04-27 16:48 ` Suren Baghdasaryan
2026-04-27 17:15 ` Michal Hocko
0 siblings, 1 reply; 21+ messages in thread
From: Suren Baghdasaryan @ 2026-04-27 16:48 UTC (permalink / raw)
To: Michal Hocko
Cc: Minchan Kim, David Hildenbrand (Arm), akpm, hca, linux-s390,
brauner, linux-mm, linux-kernel, timmurray
On Mon, Apr 27, 2026 at 12:16 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 24-04-26 12:15:18, Minchan Kim wrote:
> > On Fri, Apr 24, 2026 at 09:57:16AM +0200, David Hildenbrand (Arm) wrote:
> > > On 4/24/26 09:51, Michal Hocko wrote:
> > > > On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> > > >> For the process_mrelease reclaim, skip LRU handling for exclusive
> > > >> file-backed folios since they will be freed soon so pointless
> > > >> to move around in the LRU.
> > > >>
> > > >> This avoids costly LRU movement which accounts for a significant portion
> > > >> of the time during unmap_page_range.
> > > >>
> > > >> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> > > >> exit_mm
> > > >> __mmput
> > > >> exit_mmap
> > > >> unmap_vmas
> > > >> - unmap_page_range
> > > >> - 55.75% folio_mark_accessed
> > > >> + 48.79% __folio_batch_add_and_move
> > > >> 4.23% workingset_activation
> > > >> + 12.94% folio_remove_rmap_ptes
> > > >> + 9.86% page_table_check_clear
> > > >> + 3.34% tlb_flush_mmu
> > > >> 1.06% __page_table_check_pte_clear
> > > >>
> > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > >
> > > > As pointed out in the previous version of the patch. I really dislike
> > > > this to be mrelease or OOM specific. Behavior. You do not explain why
> > > > this needs to be this way, except for the performance reasons. My main
> > > > question is still unanswered (and NAK before this is sorted out). Why
> > > > this cannot be applied in general for _any_ exiting task. As you argue
> > > > the memory will just likely go away so why to bother?
> > >
> > > I think there was a lengthy discussion involving Johannes from a previous series.
> > >
> > > That should be linked here indeed.
> >
> > How about this?
> >
> > mm: process_mrelease: skip LRU movement for exclusive file folios
> >
> > During process_mrelease() or OOM reaping, unmapping file-backed folios
> > spends a significant portion of CPU time in folio_mark_accessed() to
> > maintain accurate LRU state (~55% of unmap time as shown in the profile
> > below).
> >
> > This patch skips LRU handling for exclusive file-backed folios during
> > such emergency memory reclaim.
> >
> > One might ask why this optimization shouldn't be applied to any exiting
> > task in general. The reason is that for a normal, orderly exit or just
> > pure kill, it is worth paying the CPU cost to preserve the active state
> > of clean file folios in case they are reused soon. Preserving cache hits
> > is beneficial for overall system performance.
>
> This is a statement rather than an explanation. Why is it worth paying
> the cost? What is different here?
>
> > However, process_mrelease() and OOM reaping are emergency operations
> > triggered under extreme memory pressure. In these scenarios, the highest
> > priority is to recover memory as quickly as possible to avoid further
> > kills or system jank. Spending half of the unmap time on LRU maintenance
> > for pages belonging to a victim process is a bad trade-off. If speeding up
> > the victim's reclaim by avoiding LRU movement and evicting cache negatively
> > affects the workflow (due to immediate restart), it implies a sub-optimal
> > kill target selection by the userspace policy (e.g., LMKD), rather than
> > a problem in this expedited APIs.
>
> Your change effectively boils down to break aging for exclusively mapped
> file pages when those pages should have been activated. All that because
> the activation has some (batched) overhead. You argue that the overhead
> is not a good trade-off for OOM path because those pages are exclusive
> to the process and therefore they will go away after the task exits.
I think Minchan's argument is that mm reaping occurs only in special
conditions (under high memory pressure) and for a very specific reason
(to free up memory and prevent system memory starvation). Therefore,
the priority in such conditions should shift towards more aggressive
memory reclaim instead of normal aging. I can see both his point and a
counter-argument that this might cause more refaults in some cases.
FWIW, in my personal experience working with Android, extra refaults
when we have free memory are better than inefficient reclaim during
memory pressure, but I'm not sure that's universally true.
> The same line of argument applies to task exiting normally too. Task
> exit it not the most hot path but certainly something noticeable,
> especially so for huge tasks.
>
> All that being said, you really need to focus why breaking the aging is
> a worth optimization. Keep in mind that while the page might be
> exlusively mapped it could still be actively consumed from the page
> cache and breaking the aging could lead to refaults.
> --
> Michal Hocko
> SUSE Labs
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-27 16:48 ` Suren Baghdasaryan
@ 2026-04-27 17:15 ` Michal Hocko
0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2026-04-27 17:15 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Minchan Kim, David Hildenbrand (Arm), akpm, hca, linux-s390,
brauner, linux-mm, linux-kernel, timmurray
On Mon 27-04-26 09:48:28, Suren Baghdasaryan wrote:
> On Mon, Apr 27, 2026 at 12:16 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Fri 24-04-26 12:15:18, Minchan Kim wrote:
> > > On Fri, Apr 24, 2026 at 09:57:16AM +0200, David Hildenbrand (Arm) wrote:
> > > > On 4/24/26 09:51, Michal Hocko wrote:
> > > > > On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> > > > >> For the process_mrelease reclaim, skip LRU handling for exclusive
> > > > >> file-backed folios since they will be freed soon so pointless
> > > > >> to move around in the LRU.
> > > > >>
> > > > >> This avoids costly LRU movement which accounts for a significant portion
> > > > >> of the time during unmap_page_range.
> > > > >>
> > > > >> - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> > > > >> exit_mm
> > > > >> __mmput
> > > > >> exit_mmap
> > > > >> unmap_vmas
> > > > >> - unmap_page_range
> > > > >> - 55.75% folio_mark_accessed
> > > > >> + 48.79% __folio_batch_add_and_move
> > > > >> 4.23% workingset_activation
> > > > >> + 12.94% folio_remove_rmap_ptes
> > > > >> + 9.86% page_table_check_clear
> > > > >> + 3.34% tlb_flush_mmu
> > > > >> 1.06% __page_table_check_pte_clear
> > > > >>
> > > > >> Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > >
> > > > > As pointed out in the previous version of the patch, I really dislike
> > > > > this being mrelease- or OOM-specific behavior. You do not explain why
> > > > > this needs to be this way, except for the performance reasons. My main
> > > > > question is still unanswered (and NAK before this is sorted out): why
> > > > > can this not be applied in general to _any_ exiting task? As you argue,
> > > > > the memory will likely just go away, so why bother?
> > > >
> > > > I think there was a lengthy discussion involving Johannes from a previous series.
> > > >
> > > > That should be linked here indeed.
> > >
> > > How about this?
> > >
> > > mm: process_mrelease: skip LRU movement for exclusive file folios
> > >
> > > During process_mrelease() or OOM reaping, unmapping file-backed folios
> > > spends a significant portion of CPU time in folio_mark_accessed() to
> > > maintain accurate LRU state (~55% of unmap time as shown in the profile
> > > below).
> > >
> > > This patch skips LRU handling for exclusive file-backed folios during
> > > such emergency memory reclaim.
> > >
> > > One might ask why this optimization shouldn't be applied to any exiting
> > > task in general. The reason is that for a normal, orderly exit or just
> > > pure kill, it is worth paying the CPU cost to preserve the active state
> > > of clean file folios in case they are reused soon. Preserving cache hits
> > > is beneficial for overall system performance.
> >
> > This is a statement rather than an explanation. Why is it worth paying
> > the cost? What is different here?
> >
> > > However, process_mrelease() and OOM reaping are emergency operations
> > > triggered under extreme memory pressure. In these scenarios, the highest
> > > priority is to recover memory as quickly as possible to avoid further
> > > kills or system jank. Spending half of the unmap time on LRU maintenance
> > > for pages belonging to a victim process is a bad trade-off. If speeding up
> > > the victim's reclaim by avoiding LRU movement and evicting cache negatively
> > > affects the workflow (due to immediate restart), it implies a sub-optimal
> > > kill target selection by the userspace policy (e.g., LMKD), rather than
> > > a problem in these expedited APIs.
> >
> > Your change effectively boils down to break aging for exclusively mapped
> > file pages when those pages should have been activated. All that because
> > the activation has some (batched) overhead. You argue that the overhead
> > is not a good trade-off for OOM path because those pages are exclusive
> > to the process and therefore they will go away after the task exits.
>
> I think Minchan's argument is that mm reaping occurs only in special
> conditions (under high memory pressure) and for a very specific reason
> (to free up memory and prevent system memory starvation). Therefore
> priority in such conditions should shift towards more aggressive
> memory reclaim instead of normal aging. I can see both his point and a
> counter-argument that this might cause more refaults in some cases.
The way I see this is that standard memory reclaim under heavy memory
pressure would likely have encountered those pages and aged them
accordingly already. So this is effectively racing with that process
and making a potentially opposite decision.
I suspect that a lack of memory reclaim, as implied by the other patch
(to deal with clean page cache), is the reason why this one makes a
difference in these Android deployments.
Unless I am completely wrong and misreading the whole situation, this
might be a very Android-specific change. The question is whether these
side effects are generally useful for other workloads. So we really need
much more explanation of the actual behavior after this change for a
wider variety of workloads.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios
2026-04-24 7:51 ` Michal Hocko
2026-04-24 7:57 ` David Hildenbrand (Arm)
@ 2026-04-24 19:26 ` Minchan Kim
1 sibling, 0 replies; 21+ messages in thread
From: Minchan Kim @ 2026-04-24 19:26 UTC (permalink / raw)
To: Michal Hocko
Cc: akpm, hca, linux-s390, david, brauner, linux-mm, linux-kernel,
surenb, timmurray
On Fri, Apr 24, 2026 at 09:51:22AM +0200, Michal Hocko wrote:
> On Tue 21-04-26 16:02:38, Minchan Kim wrote:
> > For the process_mrelease reclaim, skip LRU handling for exclusive
> > file-backed folios since they will be freed soon, so it is pointless
> > to move them around in the LRU.
> >
> > This avoids costly LRU movement which accounts for a significant portion
> > of the time during unmap_page_range.
> >
> > - 91.31% 0.00% mmap_exit_test [kernel.kallsyms] [.] exit_mm
> > exit_mm
> > __mmput
> > exit_mmap
> > unmap_vmas
> > - unmap_page_range
> > - 55.75% folio_mark_accessed
> > + 48.79% __folio_batch_add_and_move
> > 4.23% workingset_activation
> > + 12.94% folio_remove_rmap_ptes
> > + 9.86% page_table_check_clear
> > + 3.34% tlb_flush_mmu
> > 1.06% __page_table_check_pte_clear
> >
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
>
> As pointed out in the previous version of the patch, I really dislike
> this being mrelease- or OOM-specific behavior. You do not explain why
> this needs to be this way, except for the performance reasons. My main
> question is still unanswered (and NAK before this is sorted out): why
> can this not be applied in general to _any_ exiting task? As you argue,
> the memory will likely just go away, so why bother?
I revised the description to explain why I wanted to make this change
specific to process_mrelease rather than applying it in general.
https://lore.kernel.org/linux-mm/aevBRh08X4UTMUj9@google.com/
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag
2026-04-21 23:02 [PATCH v1 0/3] mm: process_mrelease: expedite clean file folio reclaim and add auto-kill Minchan Kim
2026-04-21 23:02 ` [PATCH v1 1/3] mm: process_mrelease: expedite clean file folio reclaim via mmu_gather Minchan Kim
2026-04-21 23:02 ` [PATCH v1 2/3] mm: process_mrelease: skip LRU movement for exclusive file folios Minchan Kim
@ 2026-04-21 23:02 ` Minchan Kim
2026-04-24 7:57 ` Michal Hocko
2 siblings, 1 reply; 21+ messages in thread
From: Minchan Kim @ 2026-04-21 23:02 UTC (permalink / raw)
To: akpm
Cc: hca, linux-s390, david, mhocko, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim, Minchan Kim
Currently, process_mrelease() requires userspace to send a SIGKILL signal
prior to the call. This separation introduces a scheduling race window
where the victim task may receive the signal and enter the exit path
before the reaper can invoke process_mrelease().
When the victim enters the exit path (do_exit -> exit_mm), it clears its
task->mm immediately. This causes process_mrelease() to fail with -ESRCH,
leaving the actual address space teardown (exit_mmap) to be deferred until
the mm's reference count drops to zero. In Android, arbitrary reference counts
(e.g., async I/O, reading /proc/<pid>/cmdline, or various other remote
VM accesses) frequently delay this teardown indefinitely, defeating the
purpose of expedited reclamation.
This delay keeps memory pressure high, forcing the system to unnecessarily
kill additional innocent background apps before the memory from the first
victim is recovered.
This patch introduces the PROCESS_MRELEASE_REAP_KILL UAPI flag to support
an integrated auto-kill mode. When specified, process_mrelease() directly
injects a SIGKILL into the target task.
To solve the race condition deterministically, we grab the mm reference
via mmget() and set the MMF_UNSTABLE flag *before* sending the SIGKILL.
Using mmget() instead of mmgrab() keeps mm_users > 0, preventing the
victim from calling exit_mmap() in its own exit path. This ensures that
the memory is reclaimed synchronously and deterministically by the reaper
in the context of process_mrelease(), avoiding delays caused by
non-deterministic scheduling of the victim task.
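For illustration only (not part of this patch): a minimal userspace sketch
of how an LMKD-style reaper could use the proposed flag. The helper name
reap_and_kill() is made up, error handling is trimmed, the flag value
mirrors the UAPI hunk below, and SYS_process_mrelease needs headers from a
v5.15+ kernel.

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef PROCESS_MRELEASE_REAP_KILL
#define PROCESS_MRELEASE_REAP_KILL (1 << 0)	/* mirrors the UAPI hunk below */
#endif

/* Illustrative helper: kill and synchronously reap one target process. */
static int reap_and_kill(pid_t pid)
{
	int pidfd, ret;

	pidfd = syscall(SYS_pidfd_open, pid, 0);
	if (pidfd < 0)
		return -1;

	/*
	 * With the flag set, the kernel injects SIGKILL itself and reaps
	 * the victim's address space synchronously within this call.
	 */
	ret = syscall(SYS_process_mrelease, pidfd, PROCESS_MRELEASE_REAP_KILL);
	close(pidfd);
	return ret;
}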
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
include/uapi/linux/mman.h | 4 +++
mm/oom_kill.c | 56 +++++++++++++++++++++++++++------------
2 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index e89d00528f2f..4266976b45ad 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -56,4 +56,8 @@ struct cachestat {
__u64 nr_recently_evicted;
};
+/* Flags for process_mrelease */
+#define PROCESS_MRELEASE_REAP_KILL (1 << 0)
+#define PROCESS_MRELEASE_VALID_FLAGS (PROCESS_MRELEASE_REAP_KILL)
+
#endif /* _UAPI_LINUX_MMAN_H */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c6c95c169ee..730ba0d19b53 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -20,6 +20,7 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <uapi/linux/mman.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
@@ -850,7 +851,7 @@ bool oom_killer_disable(signed long timeout)
return true;
}
-static inline bool __task_will_free_mem(struct task_struct *task)
+static inline bool __task_will_free_mem(struct task_struct *task, bool ignore_exit)
{
struct signal_struct *sig = task->signal;
@@ -862,6 +863,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
if (sig->core_state)
return false;
+ if (ignore_exit)
+ return true;
+
if (sig->flags & SIGNAL_GROUP_EXIT)
return true;
@@ -878,7 +882,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
* Caller has to make sure that task->mm is stable (hold task_lock or
* it operates on the current).
*/
-static bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task, bool ignore_exit)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
@@ -892,7 +896,7 @@ static bool task_will_free_mem(struct task_struct *task)
if (!mm)
return false;
- if (!__task_will_free_mem(task))
+ if (!__task_will_free_mem(task, ignore_exit))
return false;
/*
@@ -916,7 +920,7 @@ static bool task_will_free_mem(struct task_struct *task)
continue;
if (same_thread_group(task, p))
continue;
- ret = __task_will_free_mem(p);
+ ret = __task_will_free_mem(p, false);
if (!ret)
break;
}
@@ -1034,7 +1038,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* so it can die quickly
*/
task_lock(victim);
- if (task_will_free_mem(victim)) {
+ if (task_will_free_mem(victim, false)) {
mark_oom_victim(victim);
queue_oom_reaper(victim);
task_unlock(victim);
@@ -1135,7 +1139,7 @@ bool out_of_memory(struct oom_control *oc)
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (task_will_free_mem(current)) {
+ if (task_will_free_mem(current, false)) {
mark_oom_victim(current);
queue_oom_reaper(current);
return true;
@@ -1217,8 +1221,9 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
unsigned int f_flags;
bool reap = false;
long ret = 0;
+ bool reap_kill;
- if (flags)
+ if (flags & ~PROCESS_MRELEASE_VALID_FLAGS)
return -EINVAL;
task = pidfd_get_task(pidfd, &f_flags);
@@ -1236,19 +1241,33 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
}
mm = p->mm;
- mmgrab(mm);
- if (task_will_free_mem(p))
- reap = true;
- else {
- /* Error only if the work has not been done already */
- if (!mm_flags_test(MMF_OOM_SKIP, mm))
+ reap_kill = !!(flags & PROCESS_MRELEASE_REAP_KILL);
+ reap = task_will_free_mem(p, reap_kill);
+ if (!reap) {
+ if (reap_kill || !mm_flags_test(MMF_OOM_SKIP, mm))
ret = -EINVAL;
+
+ task_unlock(p);
+ goto put_task;
}
- task_unlock(p);
- if (!reap)
- goto drop_mm;
+ if (reap_kill) {
+ /*
+ * We use mmget() instead of mmgrab() to keep mm_users > 0,
+ * preventing the victim from calling exit_mmap() in its
+ * own exit path. This ensures that the memory is reclaimed
+ * synchronously and deterministically by the reaper.
+ */
+ mmget(mm);
+ task_unlock(p);
+ ret = kill_pid(task_tgid(task), SIGKILL, 0);
+ if (ret)
+ goto drop_mm;
+ } else {
+ mmgrab(mm);
+ task_unlock(p);
+ }
if (mmap_read_lock_killable(mm)) {
ret = -EINTR;
@@ -1263,7 +1282,10 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
mmap_read_unlock(mm);
drop_mm:
- mmdrop(mm);
+ if (reap_kill)
+ mmput(mm);
+ else
+ mmdrop(mm);
put_task:
put_task_struct(task);
return ret;
--
2.54.0.rc1.555.g9c883467ad-goog
^ permalink raw reply related	[flat|nested] 21+ messages in thread
* Re: [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag
2026-04-21 23:02 ` [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag Minchan Kim
@ 2026-04-24 7:57 ` Michal Hocko
2026-04-24 22:49 ` Minchan Kim
0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2026-04-24 7:57 UTC (permalink / raw)
To: Minchan Kim
Cc: akpm, hca, linux-s390, david, brauner, linux-mm, linux-kernel,
surenb, timmurray, Minchan Kim
On Tue 21-04-26 16:02:39, Minchan Kim wrote:
> Currently, process_mrelease() requires userspace to send a SIGKILL signal
> prior to the call. This separation introduces a scheduling race window
> where the victim task may receive the signal and enter the exit path
> before the reaper can invoke process_mrelease().
>
> When the victim enters the exit path (do_exit -> exit_mm), it clears its
> task->mm immediately. This causes process_mrelease() to fail with -ESRCH,
> leaving the actual address space teardown (exit_mmap) to be deferred until
> the mm's reference count drops to zero. In Android, arbitrary reference counts
> (e.g., async I/O, reading /proc/<pid>/cmdline, or various other remote
> VM accesses) frequently delay this teardown indefinitely, defeating the
> purpose of expedited reclamation.
>
> This delay keeps memory pressure high, forcing the system to unnecessarily
> kill additional innocent background apps before the memory from the first
> victim is recovered.
Thanks, this makes the motivation much more clear and usecase very
sound.
> This patch introduces the PROCESS_MRELEASE_REAP_KILL UAPI flag to support
> an integrated auto-kill mode. When specified, process_mrelease() directly
> injects a SIGKILL into the target task.
>
> To solve the race condition deterministically, we grab the mm reference
> via mmget() and set the MMF_UNSTABLE flag *before* sending the SIGKILL.
> Using mmget() instead of mmgrab() keeps mm_users > 0, preventing the
> victim from calling exit_mmap() in its own exit path.
Why is this needed? Address space tear down is an operation that can run
from several execution contexts.
> This ensures that
> the memory is reclaimed synchronously and deterministically by the reaper
> in the context of process_mrelease(), avoiding delays caused by
> non-deterministic scheduling of the victim task.
The memory is still reclaimed synchronously from the mrelease context.
This is really confusing.
Please also explain why you need to jump through all those ugly
task_will_free_mem hoops. Why can't you simply kill the task if
task_will_free_mem fails (if PROCESS_MRELEASE_REAP_KILL is used)?
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag
2026-04-24 7:57 ` Michal Hocko
@ 2026-04-24 22:49 ` Minchan Kim
2026-04-27 7:02 ` Michal Hocko
0 siblings, 1 reply; 21+ messages in thread
From: Minchan Kim @ 2026-04-24 22:49 UTC (permalink / raw)
To: Michal Hocko
Cc: akpm, hca, linux-s390, david, brauner, linux-mm, linux-kernel,
surenb, timmurray
On Fri, Apr 24, 2026 at 09:57:20AM +0200, Michal Hocko wrote:
> On Tue 21-04-26 16:02:39, Minchan Kim wrote:
> > Currently, process_mrelease() requires userspace to send a SIGKILL signal
> > prior to the call. This separation introduces a scheduling race window
> > where the victim task may receive the signal and enter the exit path
> > before the reaper can invoke process_mrelease().
> >
> > When the victim enters the exit path (do_exit -> exit_mm), it clears its
> > task->mm immediately. This causes process_mrelease() to fail with -ESRCH,
> > leaving the actual address space teardown (exit_mmap) to be deferred until
> > the mm's reference count drops to zero. In Android, arbitrary reference counts
> > (e.g., async I/O, reading /proc/<pid>/cmdline, or various other remote
> > VM accesses) frequently delay this teardown indefinitely, defeating the
> > purpose of expedited reclamation.
> >
> > This delay keeps memory pressure high, forcing the system to unnecessarily
> > kill additional innocent background apps before the memory from the first
> > victim is recovered.
>
> Thanks, this makes the motivation much more clear and usecase very
> sound.
>
> > This patch introduces the PROCESS_MRELEASE_REAP_KILL UAPI flag to support
> > an integrated auto-kill mode. When specified, process_mrelease() directly
> > injects a SIGKILL into the target task.
> >
> > To solve the race condition deterministically, we grab the mm reference
> > via mmget() and set the MMF_UNSTABLE flag *before* sending the SIGKILL.
> > Using mmget() instead of mmgrab() keeps mm_users > 0, preventing the
> > victim from calling exit_mmap() in its own exit path.
>
> Why is this needed? Address space tear down is an operation that can run
> from several execution contexts.
Agreed.
>
> > This ensures that
> > the memory is reclaimed synchronously and deterministically by the reaper
> > in the context of process_mrelease(), avoiding delays caused by
> > non-deterministic scheduling of the victim task.
>
> The memory is still reclaimed synchronously from the mrelease context.
> This is really confusing.
>
> Please also explain why do you need to do all that ugly
> task_will_free_mem hoops. Why cannot you simply kill the task if
> task_will_free_mem fails (if PROCESS_MRELEASE_REAP_KILL is used).
I wanted to handle shared address spaces.
Even though we are okay with the target task not being in a SIGKILL
state yet (since we are about to kill it), we must ensure that all
*other* processes sharing the same mm are also dying.
If we simply bypass the check and force a kill when there are living sharers,
the memory will NOT be freed even after the target task dies because
the other processes still pin the mm.
So, to address this, I think we need to modify task_will_free_mem() slightly
to ignore the exit state of the *target* task only, while still checking that
all *other* sharing processes are dying:
static bool task_will_free_mem(struct task_struct *task, bool ignore_exit)
{
	...
	/* ignore the target task's signal state */
	if (!__task_will_free_mem(task, ignore_exit))
		return false;

	/*
	 * but other processes sharing the mm with the target must be in
	 * an exit state
	 */
	for_each_process(p) {
		...
		if (!__task_will_free_mem(p, false))
			return false;
	}
	...
}
^ permalink raw reply	[flat|nested] 21+ messages in thread
* Re: [PATCH v1 3/3] mm: process_mrelease: introduce PROCESS_MRELEASE_REAP_KILL flag
2026-04-24 22:49 ` Minchan Kim
@ 2026-04-27 7:02 ` Michal Hocko
0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2026-04-27 7:02 UTC (permalink / raw)
To: Minchan Kim
Cc: akpm, hca, linux-s390, david, brauner, linux-mm, linux-kernel,
surenb, timmurray
On Fri 24-04-26 15:49:19, Minchan Kim wrote:
> On Fri, Apr 24, 2026 at 09:57:20AM +0200, Michal Hocko wrote:
> > On Tue 21-04-26 16:02:39, Minchan Kim wrote:
> > > Currently, process_mrelease() requires userspace to send a SIGKILL signal
> > > prior to the call. This separation introduces a scheduling race window
> > > where the victim task may receive the signal and enter the exit path
> > > before the reaper can invoke process_mrelease().
> > >
> > > When the victim enters the exit path (do_exit -> exit_mm), it clears its
> > > task->mm immediately. This causes process_mrelease() to fail with -ESRCH,
> > > leaving the actual address space teardown (exit_mmap) to be deferred until
> > > the mm's reference count drops to zero. In Android, arbitrary reference counts
> > > (e.g., async I/O, reading /proc/<pid>/cmdline, or various other remote
> > > VM accesses) frequently delay this teardown indefinitely, defeating the
> > > purpose of expedited reclamation.
> > >
> > > This delay keeps memory pressure high, forcing the system to unnecessarily
> > > kill additional innocent background apps before the memory from the first
> > > victim is recovered.
> >
> > Thanks, this makes the motivation much more clear and usecase very
> > sound.
> >
> > > This patch introduces the PROCESS_MRELEASE_REAP_KILL UAPI flag to support
> > > an integrated auto-kill mode. When specified, process_mrelease() directly
> > > injects a SIGKILL into the target task.
> > >
> > > To solve the race condition deterministically, we grab the mm reference
> > > via mmget() and set the MMF_UNSTABLE flag *before* sending the SIGKILL.
> > > Using mmget() instead of mmgrab() keeps mm_users > 0, preventing the
> > > victim from calling exit_mmap() in its own exit path.
> >
> > Why is this needed? Address space tear down is an operation that can run
> > from several execution contexts.
>
> Agreed.
>
> >
> > > This ensures that
> > > the memory is reclaimed synchronously and deterministically by the reaper
> > > in the context of process_mrelease(), avoiding delays caused by
> > > non-deterministic scheduling of the victim task.
> >
> > The memory is still reclaimed synchronously from the mrelease context.
> > This is really confusing.
> >
> > Please also explain why you need to jump through all those ugly
> > task_will_free_mem hoops. Why can't you simply kill the task if
> > task_will_free_mem fails (if PROCESS_MRELEASE_REAP_KILL is used)?
>
> I wanted to handle shared address spaces.
> Even though we are okay with the target task not being in a SIGKILL
> state yet (since we are about to kill it), we must ensure that all
> *other* processes sharing the same mm are also dying.
Then just bail out when the mm is shared across thread groups, rather
than killing just one of them. Or kill all of them. There is no reason to
play around with that at the task_will_free_mem level.
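A minimal sketch of what such an up-front bail-out could look like inside
process_mrelease(), assuming the existing process_shares_mm() helper in
mm/oom_kill.c; the -EBUSY choice and the exact placement are illustrative,
not a worked-out implementation.

	if (reap_kill) {
		struct task_struct *p;

		/*
		 * Sketch: refuse the combined kill+reap when the mm is
		 * shared with another thread group, instead of teaching
		 * task_will_free_mem() about ignore_exit.
		 */
		rcu_read_lock();
		for_each_process(p) {
			if (same_thread_group(task, p))
				continue;
			if (process_shares_mm(p, mm)) {
				ret = -EBUSY;	/* illustrative error choice */
				break;
			}
		}
		rcu_read_unlock();
		/* then unwind via the existing error paths on failure */
	}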
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 21+ messages in thread