Linux cgroups development
 help / color / mirror / Atom feed
From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: linux-mm@kvack.org
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	Muchun Song <muchun.song@linux.dev>,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-team@meta.com
Subject: [RFC PATCH 9/9 v2] mm/memcontrol: Make memory.max tier-aware
Date: Thu, 23 Apr 2026 13:34:43 -0700	[thread overview]
Message-ID: <20260423203445.2914963-10-joshua.hahnjy@gmail.com> (raw)
In-Reply-To: <20260423203445.2914963-1-joshua.hahnjy@gmail.com>

On machines serving multiple workloads whose memory is isolated via the
memory cgroup controller, it is currently impossible to enforce a fair
distribution of toptier memory among the workloads, as the limits only
enforce total memory footprint, but not where that memory resides.

This makes ensuring consistent baseline performance difficult, as each
workload's performance is heavily impacted by workload-external factors
such as which other workloads are co-located in the same host, and the
order in which the workloads are started.

Extend the existing memory.max limit enforcement to be tier-aware.

Depending on the combination of limit breaches, select which nodes to
reclaim from: when memory.max is breached, perform reclaim on all
nodes.  When memory.max is safe but toptier.max is breached, perform
targeted reclaim on toptier nodes only.

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e5f39830d250d..d8d67ada993ff 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1518,6 +1518,15 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 	if (count < limit)
 		margin = limit - count;
 
+	if (mem_cgroup_tiered_limits()) {
+		count = page_counter_read(&memcg->toptier);
+		limit = READ_ONCE(memcg->toptier.max);
+		if (count < limit)
+			margin = min(margin, limit - count);
+		else
+			margin = 0;
+	}
+
 	if (do_memsw_account()) {
 		count = page_counter_read(&memcg->memsw);
 		limit = READ_ONCE(memcg->memsw.max);
@@ -2424,11 +2433,12 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	bool raised_max_event = false;
 	unsigned long pflags;
 	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
-	bool toptier_charged;
+	nodemask_t toptier_nodes;
+	nodemask_t *reclaim_nodes;
 
 retry:
 	reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
-	toptier_charged = false;
+	reclaim_nodes = NULL;
 
 	if (do_memsw_account() &&
 	    !page_counter_try_charge(&memcg->memsw, nr_pages, &counter)) {
@@ -2438,13 +2448,20 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	}
 
 	if (toptier &&
-	    page_counter_try_charge(&memcg->toptier, nr_pages, &counter))
-		toptier_charged = true;
+	    !page_counter_try_charge(&memcg->toptier, nr_pages, &counter)) {
+		get_toptier_nodemask(&toptier_nodes);
+		reclaim_nodes = &toptier_nodes;
+		mem_over_limit = mem_cgroup_from_counter(counter, toptier);
+
+		if (do_memsw_account())
+			page_counter_uncharge(&memcg->memsw, nr_pages);
+		goto reclaim;
+	}
 
 	if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
 		goto done_restock;
 
-	if (toptier_charged)
+	if (toptier)
 		page_counter_uncharge(&memcg->toptier, nr_pages);
 	if (do_memsw_account())
 		page_counter_uncharge(&memcg->memsw, nr_pages);
@@ -2473,7 +2490,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
 						    gfp_mask, reclaim_options,
-						    NULL, NULL);
+						    NULL, reclaim_nodes);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -4683,7 +4700,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
 	bool drained = false;
-	unsigned long max;
+	unsigned long max, toptier_max = PAGE_COUNTER_MAX;
+	nodemask_t toptier_nodes;
 	int err;
 
 	buf = strstrip(buf);
@@ -4692,16 +4710,30 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 		return err;
 
 	xchg(&memcg->memory.max, max);
-	if (mem_cgroup_tiered_limits())
-		xchg(&memcg->toptier.max, page_counter_max_or_scale(max));
+	if (mem_cgroup_tiered_limits()) {
+		toptier_max = page_counter_max_or_scale(max);
+		xchg(&memcg->toptier.max, toptier_max);
+		get_toptier_nodemask(&toptier_nodes);
+	}
 
 	if (of->file->f_flags & O_NONBLOCK)
 		goto out;
 
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
+		unsigned long nr_toptier = page_counter_read(&memcg->toptier);
+		unsigned long to_reclaim = 0;
+		nodemask_t *reclaim_nodes = NULL;
+
+		if (nr_pages > max) {
+			to_reclaim = nr_pages - max;
+		} else if (mem_cgroup_tiered_limits() &&
+				nr_toptier > toptier_max) {
+			to_reclaim = nr_toptier - toptier_max;
+			reclaim_nodes = &toptier_nodes;
+		}
 
-		if (nr_pages <= max)
+		if (!to_reclaim)
 			break;
 
 		if (signal_pending(current))
@@ -4714,9 +4746,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 		}
 
 		if (nr_reclaims) {
-			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+			if (!try_to_free_mem_cgroup_pages(memcg, to_reclaim,
 					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
-					NULL, NULL))
+					NULL, reclaim_nodes))
 				nr_reclaims--;
 			continue;
 		}
-- 
2.52.0


  parent reply	other threads:[~2026-04-23 20:35 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-23 20:34 [RFC PATCH 0/9 v2] mm/memcontrol: Make memory cgroup limits tier-aware Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 1/9 v2] cgroup: Introduce memory_tiered_limits cgroup mount option Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 3/9 v2] mm/memcontrol: Refactor page_counter charging in try_charge_memcg Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 4/9 v2] mm/memcontrol: charge/uncharge toptier memory to mem_cgroup Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 5/9 v2] mm/memcontrol: Set toptier limits proportional to memory limits Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 6/9 v2] mm/vmscan, memcontrol: Add nodemask to try_to_free_mem_cgroup_pages Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 7/9 v2] mm/memcontrol: Make memory.low and memory.min tier-aware Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 8/9 v2] mm/memcontrol: Make memory.high tier-aware Joshua Hahn
2026-04-23 20:34 ` Joshua Hahn [this message]
2026-05-11 15:56 ` [RFC PATCH 0/9 v2] mm/memcontrol: Make memory cgroup limits tier-aware David Hildenbrand (Arm)
2026-05-11 20:03   ` Joshua Hahn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260423203445.2914963-10-joshua.hahnjy@gmail.com \
    --to=joshua.hahnjy@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox