From: Bing Jiao <bingjiao@google.com>
To: linux-mm@kvack.org
Cc: Bing Jiao <bingjiao@google.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeel.butt@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
Andrew Morton <akpm@linux-foundation.org>,
David Rientjes <rientjes@google.com>,
Yosry Ahmed <yosry@kernel.org>,
cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
Barry Song <baohua@kernel.org>,
Youngjun Park <youngjun.park@lge.com>,
David Hildenbrand <david@kernel.org>,
Qi Zheng <zhengqi.arch@bytedance.com>,
Lorenzo Stoakes <ljs@kernel.org>,
Axel Rasmussen <axelrasmussen@google.com>,
Yuanchu Xie <yuanchu@google.com>, Wei Xu <weixugc@google.com>,
Joshua Hahn <joshua.hahnjy@gmail.com>
Subject: [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim
Date: Tue, 17 Mar 2026 23:07:01 +0000 [thread overview]
Message-ID: <20260317230720.990329-3-bingjiao@google.com> (raw)
In-Reply-To: <20260317230720.990329-1-bingjiao@google.com>
NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
it does not reduce the total memory usage of a memcg. Demotion is
currently allowed in memcg direct reclaim paths (e.g., charge-triggered
reclaim or manual limit writes), which leads to "fake progress": the
reclaim loop concludes it has satisfied the memory request without
actually reducing the cgroup's charge.
This can result in inefficient reclaim loops, wasted CPU time, migration
of all pages to far-tier nodes, and potentially premature OOM kills when
the cgroup is under memory pressure but demotion is still possible.
Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
these memcg-specific reclaim paths. This ensures that reclaim
progress is only counted when memory is actually freed or swapped out.
Signed-off-by: Bing Jiao <bingjiao@google.com>
---
include/linux/swap.h | 1 +
mm/memcontrol-v1.c | 10 ++++++++--
mm/memcontrol.c | 16 +++++++++++-----
mm/vmscan.c | 1 +
4 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7a09df6977a5..e83897a6dc72 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 433bba9dfe71..3cb600e28e5b 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
+ unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
+
+ if (!memsw)
+ reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
do {
if (signal_pending(current)) {
@@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
+ reclaim_options, NULL)) {
ret = -EBUSY;
break;
}
@@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP, NULL))
+ reclaim_options, NULL))
nr_retries--;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 303ac622d22d..fcf1cd0da643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
do {
unsigned long pflags;
@@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP,
+ reclaim_options,
NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
@@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options, NULL);
+ gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
bool drained = false;
unsigned long high;
int err;
@@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
+ GFP_KERNEL, reclaim_options, NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
bool drained = false;
unsigned long max;
int err;
@@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
+ GFP_KERNEL, reclaim_options, NULL))
nr_reclaims--;
continue;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33287ba4a500..7a8617ba1748 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
--
2.53.0.851.ga537e3e6e9-goog
next prev parent reply other threads:[~2026-03-17 23:07 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-17 23:06 [PATCH 0/3] mm/memcontrol: control demotion in memcg reclaim Bing Jiao
2026-03-17 23:07 ` [PATCH 1/3] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-17 23:38 ` Yosry Ahmed
2026-03-17 23:07 ` Bing Jiao [this message]
2026-03-17 23:44 ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Yosry Ahmed
2026-03-18 20:57 ` Bing Jiao
2026-03-18 21:56 ` [PATCH v2] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-18 22:06 ` Yosry Ahmed
2026-03-18 22:19 ` [PATCH v3] " Bing Jiao
2026-03-18 22:54 ` Johannes Weiner
2026-03-18 23:28 ` Shakeel Butt
2026-03-19 9:29 ` Michal Hocko
2026-03-20 3:39 ` Bing Jiao
2026-03-20 9:32 ` Michal Hocko
2026-03-21 3:34 ` [PATCH v4] " Bing Jiao
2026-03-20 13:17 ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Donet Tom
2026-03-21 4:04 ` Bing Jiao
2026-03-17 23:07 ` [PATCH 3/3] mm/vmscan: add demote= option to proactive reclaim Bing Jiao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260317230720.990329-3-bingjiao@google.com \
--to=bingjiao@google.com \
--cc=akpm@linux-foundation.org \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=bhe@redhat.com \
--cc=cgroups@vger.kernel.org \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=hannes@cmpxchg.org \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=mhocko@kernel.org \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=weixugc@google.com \
--cc=yosry@kernel.org \
--cc=youngjun.park@lge.com \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox