Linux cgroups development
 help / color / mirror / Atom feed
From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: linux-mm@kvack.org
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	Muchun Song <muchun.song@linux.dev>,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-team@meta.com
Subject: [RFC PATCH 8/9 v2] mm/memcontrol: Make memory.high tier-aware
Date: Thu, 23 Apr 2026 13:34:42 -0700	[thread overview]
Message-ID: <20260423203445.2914963-9-joshua.hahnjy@gmail.com> (raw)
In-Reply-To: <20260423203445.2914963-1-joshua.hahnjy@gmail.com>

On machines serving multiple workloads whose memory is isolated via the
memory cgroup controller, it is currently impossible to enforce a fair
distribution of toptier memory among the workloads, as the limits only
enforce total memory footprint, but not where that memory resides.

This makes ensuring consistent baseline performance difficult, as each
workload's performance is heavily impacted by workload-external factors
such as which other workloads are co-located in the same host, and the
order in which the workloads are started.

Extend the existing memory.high protection to be tier-aware.

Depending on the combination of limit breaches, selectively reclaim on
toptier nodes: when memory.high is breached, perform reclaim on all
nodes. When memory.high is safe but toptier.high is breached, perform
targeted reclaim on toptier nodes only.

Also, throttle allocations when the toptier limit is breached, making
sure not to double-penalize when both the toptier and memory limits are
breached.

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 mm/memcontrol.c | 82 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 72 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b115ff40e268d..e5f39830d250d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2112,10 +2112,25 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 	do {
 		unsigned long pflags;
+		nodemask_t toptier_nodes;
+		nodemask_t *reclaim_targets = NULL;
 
 		if (page_counter_read(&memcg->memory) <=
-		    READ_ONCE(memcg->memory.high))
-			continue;
+		    READ_ONCE(memcg->memory.high)) {
+			if (!mem_cgroup_tiered_limits())
+				continue;
+
+			/*
+			 * Even if the memcg is under the memory limit, toptier
+			 * may have breached the toptier limit. Engage
+			 * targeted reclaim on toptier nodes if so.
+			 */
+			if (page_counter_read(&memcg->toptier) <=
+			    READ_ONCE(memcg->toptier.high))
+				continue;
+			get_toptier_nodemask(&toptier_nodes);
+			reclaim_targets = &toptier_nodes;
+		}
 
 		memcg_memory_event(memcg, MEMCG_HIGH);
 
@@ -2123,7 +2138,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
 							gfp_mask,
 							MEMCG_RECLAIM_MAY_SWAP,
-							NULL, NULL);
+							NULL, reclaim_targets);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2224,6 +2239,23 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg)
 	return max_overage;
 }
 
+static u64 toptier_find_max_overage(struct mem_cgroup *memcg)
+{
+	u64 overage, max_overage = 0;
+
+	if (!mem_cgroup_tiered_limits())
+		return 0;
+
+	do {
+		overage = calculate_overage(page_counter_read(&memcg->toptier),
+					    READ_ONCE(memcg->toptier.high));
+		max_overage = max(overage, max_overage);
+	} while ((memcg = parent_mem_cgroup(memcg)) &&
+		  !mem_cgroup_is_root(memcg));
+
+	return max_overage;
+}
+
 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
 {
 	u64 overage, max_overage = 0;
@@ -2326,6 +2358,14 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
 	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
 					       mem_find_max_overage(memcg));
 
+	/*
+	 * Don't double-penalize for toptier high overage if memory.high
+	 * overage penalization has already been accounted for.
+	 */
+	if (!penalty_jiffies)
+		penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+					toptier_find_max_overage(memcg));
+
 	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
 						swap_find_max_overage(memcg));
 
@@ -2522,22 +2562,26 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	do {
 		bool mem_high, swap_high;
+		bool toptier_high = false;
 
 		mem_high = page_counter_read(&memcg->memory) >
 			READ_ONCE(memcg->memory.high);
 		swap_high = page_counter_read(&memcg->swap) >
 			READ_ONCE(memcg->swap.high);
+		toptier_high = mem_cgroup_tiered_limits() &&
+			       page_counter_read(&memcg->toptier) >
+			       READ_ONCE(memcg->toptier.high);
 
 		/* Don't bother a random interrupted task */
 		if (!in_task()) {
-			if (mem_high) {
+			if (mem_high || toptier_high) {
 				schedule_work(&memcg->high_work);
 				break;
 			}
 			continue;
 		}
 
-		if (mem_high || swap_high) {
+		if (mem_high || swap_high || toptier_high) {
 			/*
 			 * The allocating tasks in this cgroup will need to do
 			 * reclaim or be throttled to prevent further growth
@@ -4577,10 +4621,28 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
-		unsigned long reclaimed;
+		unsigned long reclaimed, charge;
+		nodemask_t toptier_nodes;
+		nodemask_t *reclaim_targets = NULL;
 
-		if (nr_pages <= high)
-			break;
+		if (nr_pages <= high) {
+			unsigned long toptier_nr_pages, toptier_high;
+
+			if (!mem_cgroup_tiered_limits())
+				break;
+
+			toptier_nr_pages = page_counter_read(&memcg->toptier);
+			toptier_high = READ_ONCE(memcg->toptier.high);
+
+			if (toptier_nr_pages <= toptier_high)
+				break;
+
+			get_toptier_nodemask(&toptier_nodes);
+			reclaim_targets = &toptier_nodes;
+			charge = toptier_nr_pages - toptier_high;
+		} else {
+			charge = nr_pages - high;
+		}
 
 		if (signal_pending(current))
 			break;
@@ -4591,9 +4653,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 			continue;
 		}
 
-		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+		reclaimed = try_to_free_mem_cgroup_pages(memcg, charge,
 					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
-					NULL, NULL);
+					NULL, reclaim_targets);
 
 		if (!reclaimed && !nr_retries--)
 			break;
-- 
2.52.0


  parent reply	other threads:[~2026-04-23 20:35 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-23 20:34 [RFC PATCH 0/9 v2] mm/memcontrol: Make memory cgroup limits tier-aware Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 1/9 v2] cgroup: Introduce memory_tiered_limits cgroup mount option Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 3/9 v2] mm/memcontrol: Refactor page_counter charging in try_charge_memcg Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 4/9 v2] mm/memcontrol: charge/uncharge toptier memory to mem_cgroup Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 5/9 v2] mm/memcontrol: Set toptier limits proportional to memory limits Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 6/9 v2] mm/vmscan, memcontrol: Add nodemask to try_to_free_mem_cgroup_pages Joshua Hahn
2026-04-23 20:34 ` [RFC PATCH 7/9 v2] mm/memcontrol: Make memory.low and memory.min tier-aware Joshua Hahn
2026-04-23 20:34 ` Joshua Hahn [this message]
2026-04-23 20:34 ` [RFC PATCH 9/9 v2] mm/memcontrol: Make memory.max tier-aware Joshua Hahn
2026-05-11 15:56 ` [RFC PATCH 0/9 v2] mm/memcontrol: Make memory cgroup limits tier-aware David Hildenbrand (Arm)
2026-05-11 20:03   ` Joshua Hahn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260423203445.2914963-9-joshua.hahnjy@gmail.com \
    --to=joshua.hahnjy@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox