From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: linux-mm@kvack.org
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	Muchun Song <muchun.song@linux.dev>,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-team@meta.com
Subject: [RFC PATCH 4/9 v2] mm/memcontrol: charge/uncharge toptier memory to mem_cgroup
Date: Thu, 23 Apr 2026 13:34:38 -0700
Message-ID: <20260423203445.2914963-5-joshua.hahnjy@gmail.com>
In-Reply-To: <20260423203445.2914963-1-joshua.hahnjy@gmail.com>

Memory cgroup limits currently offer a way to isolate memory as a
resource, but they treat the cost and value of all memory as equal,
regardless of whether it resides on a toptier node or not.

To better capture the asymmetric utility of toptier memory relative to
"lowtier" memory, account toptier memory usage in parallel with the
existing memory accounting mechanisms. To do this, introduce a new
page_counter, "toptier", to mem_cgroup.

From a simplified perspective, we can achieve this by checking the
physical location of a folio whenever the memory page_counter is
updated and deciding whether to also account it to toptier. Add a new
"toptier" parameter to try_charge_memcg(), which callers must
determine.
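
Concretely, try_charge_memcg() tries the toptier counter first (best
effort), then memory, and rolls toptier back if the memory charge
fails. A simplified sketch of the hunk below, with the reclaim and
retry paths trimmed:

	if (toptier &&
	    page_counter_try_charge(&memcg->toptier, nr_pages, &counter))
		toptier_charged = true;

	if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
		goto done_restock;	/* success: both counters charged */

	if (toptier_charged)	/* memory failed: undo the toptier charge */
		page_counter_uncharge(&memcg->toptier, nr_pages);

LRU callers derive the new parameter from the folio's node via the
should_charge_toptier() helper introduced below:

	ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio),
			       should_charge_toptier(folio));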

However, as of this RFC, this simplified model only works on LRU
folios (callers of try_charge_memcg() via charge_memcg()). The other
two call sites, obj_cgroup_charge_pages() and mem_cgroup_sk_charge(),
will be addressed in future patches that transition the relevant
enum memcg_stat_item counters to per-lruvec counters.

Enforcement mechanisms are not present at this point: failing the
toptier limit check has no consequence yet (it neither fails the
charge nor triggers reclaim), but the charges are still accumulated.
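
For example, in the force-charge path the toptier counter is simply
pushed past its limit, with no reclaim or failure (sketch from the
hunk below):

	/* force charge: toptier usage may temporarily exceed its limit */
	if (toptier)
		page_counter_charge(&memcg->toptier, nr_pages);
	page_counter_charge(&memcg->memory, nr_pages);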

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            | 63 ++++++++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index be45641e890e4..0cdb6cd1955dc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,7 @@ struct mem_cgroup {
 
 	/* Accounted resources */
 	struct page_counter memory;		/* Both v1 & v2 */
+	struct page_counter toptier;		/* v2 only */
 
 	union {
 		struct page_counter swap;	/* v2 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f7bedb55dbb1..d891cf77cf6d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -53,6 +53,7 @@
 #include <linux/seq_file.h>
 #include <linux/vmpressure.h>
 #include <linux/memremap.h>
+#include <linux/memory-tiers.h>
 #include <linux/mm_inline.h>
 #include <linux/swap_cgroup.h>
 #include <linux/cpu.h>
@@ -2096,6 +2097,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 
 	for_each_mem_cgroup(memcg) {
 		page_counter_drain_cpu(&memcg->memory, cpu);
+		page_counter_drain_cpu(&memcg->toptier, cpu);
 		page_counter_drain_cpu(&memcg->memsw, cpu);
 	}
 
@@ -2370,7 +2372,7 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
 }
 
 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
-			    unsigned int nr_pages)
+			    unsigned int nr_pages, bool toptier)
 {
 	int nr_retries = MAX_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
@@ -2382,9 +2384,11 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	bool raised_max_event = false;
 	unsigned long pflags;
 	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+	bool toptier_charged;
 
 retry:
 	reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+	toptier_charged = false;
 
 	if (do_memsw_account() &&
 	    !page_counter_try_charge(&memcg->memsw, nr_pages, &counter)) {
@@ -2393,11 +2397,18 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		goto reclaim;
 	}
 
+	if (toptier &&
+	    page_counter_try_charge(&memcg->toptier, nr_pages, &counter))
+		toptier_charged = true;
+
 	if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
 		goto done_restock;
 
+	if (toptier_charged)
+		page_counter_uncharge(&memcg->toptier, nr_pages);
 	if (do_memsw_account())
 		page_counter_uncharge(&memcg->memsw, nr_pages);
+
 	mem_over_limit = mem_cgroup_from_counter(counter, memory);
 
 reclaim:
@@ -2490,6 +2501,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * being freed very soon.  Allow memory usage go over the limit
 	 * temporarily by force charging it.
 	 */
+	if (toptier)
+		page_counter_charge(&memcg->toptier, nr_pages);
 	page_counter_charge(&memcg->memory, nr_pages);
 	if (do_memsw_account())
 		page_counter_charge(&memcg->memsw, nr_pages);
@@ -2559,7 +2572,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_is_root(memcg))
 		return 0;
 
-	return try_charge_memcg(memcg, gfp_mask, nr_pages);
+	return try_charge_memcg(memcg, gfp_mask, nr_pages, false);
 }
 
 static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
@@ -2859,7 +2872,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
 
 	memcg = get_mem_cgroup_from_objcg(objcg);
 
-	ret = try_charge_memcg(memcg, gfp, nr_pages);
+	ret = try_charge_memcg(memcg, gfp, nr_pages, false);
 	if (ret)
 		goto out;
 
@@ -2888,6 +2901,11 @@ static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
 	page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
 }
 
+static bool should_charge_toptier(struct folio *folio)
+{
+	return mem_cgroup_tiered_limits() && node_is_toptier(folio_nid(folio));
+}
+
 /**
  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
  * @page: page to charge
@@ -3760,6 +3778,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	page_counter_free_stock(&memcg->memory);
+	page_counter_free_stock(&memcg->toptier);
 	page_counter_free_stock(&memcg->memsw);
 	lru_gen_exit_memcg(memcg);
 	memcg_wb_domain_exit(memcg);
@@ -3866,6 +3885,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 
 		page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
+		page_counter_init(&memcg->toptier, &parent->toptier, memcg_on_dfl);
 		page_counter_init(&memcg->swap, &parent->swap, false);
 #ifdef CONFIG_MEMCG_V1
 		memcg->memory.track_failcnt = !memcg_on_dfl;
@@ -3877,6 +3897,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		init_memcg_stats();
 		init_memcg_events();
 		page_counter_init(&memcg->memory, NULL, true);
+		page_counter_init(&memcg->toptier, NULL, true);
 		page_counter_init(&memcg->swap, NULL, false);
 #ifdef CONFIG_MEMCG_V1
 		page_counter_init(&memcg->kmem, NULL, false);
@@ -3936,6 +3957,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 
 	/* failure is nonfatal, charges fall back to direct hierarchy */
 	page_counter_enable_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+	page_counter_enable_stock(&memcg->toptier, MEMCG_CHARGE_BATCH);
 	if (do_memsw_account())
 		page_counter_enable_stock(&memcg->memsw, MEMCG_CHARGE_BATCH);
 
@@ -4013,6 +4035,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	drain_all_stock(memcg);
 	page_counter_disable_stock(&memcg->memory);
+	page_counter_disable_stock(&memcg->toptier);
 	page_counter_disable_stock(&memcg->memsw);
 
 	mem_cgroup_private_id_put(memcg, 1);
@@ -4825,7 +4848,8 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	objcg = get_obj_cgroup_from_memcg(memcg);
 	/* Do not account at the root objcg level. */
 	if (!obj_cgroup_is_root(objcg))
-		ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio));
+		ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio),
+				       should_charge_toptier(folio));
 	if (ret) {
 		obj_cgroup_put(objcg);
 		return ret;
@@ -4922,6 +4946,7 @@ struct uncharge_gather {
 	unsigned long nr_memory;
 	unsigned long pgpgout;
 	unsigned long nr_kmem;
+	unsigned long nr_toptier;
 	int nid;
 };
 
@@ -4942,6 +4967,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 			mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
 			memcg1_account_kmem(memcg, -ug->nr_kmem);
 		}
+		if (ug->nr_toptier)
+			page_counter_uncharge(&memcg->toptier, ug->nr_toptier);
 		memcg1_oom_recover(memcg);
 	}
 
@@ -4987,8 +5014,11 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 		ug->nr_kmem += nr_pages;
 	} else {
 		/* LRU pages aren't accounted at the root level */
-		if (!obj_cgroup_is_root(objcg))
+		if (!obj_cgroup_is_root(objcg)) {
 			ug->nr_memory += nr_pages;
+			if (should_charge_toptier(folio))
+				ug->nr_toptier += nr_pages;
+		}
 		ug->pgpgout++;
 
 		WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
@@ -5063,6 +5093,10 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
 		page_counter_charge(&memcg->memory, nr_pages);
 		if (do_memsw_account())
 			page_counter_charge(&memcg->memsw, nr_pages);
+
+		/* old folio's toptier usage will be uncharged on free */
+		if (should_charge_toptier(new))
+			page_counter_charge(&memcg->toptier, nr_pages);
 	}
 
 	obj_cgroup_get(objcg);
@@ -5105,6 +5139,23 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 	if (!objcg)
 		return;
 
+	if (!obj_cgroup_is_root(objcg)) {
+		struct mem_cgroup *memcg;
+		unsigned long nr_pages = folio_nr_pages(old);
+		bool old_toptier, new_toptier;
+
+		rcu_read_lock();
+		memcg = obj_cgroup_memcg(objcg);
+		old_toptier = should_charge_toptier(old);
+		new_toptier = should_charge_toptier(new);
+
+		if (old_toptier && !new_toptier)
+			page_counter_uncharge(&memcg->toptier, nr_pages);
+		else if (!old_toptier && new_toptier)
+			page_counter_charge(&memcg->toptier, nr_pages);
+		rcu_read_unlock();
+	}
+
 	/* Transfer the charge and the objcg ref */
 	commit_charge(new, objcg);
 
@@ -5180,7 +5231,7 @@ bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
 
-	if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
+	if (try_charge_memcg(memcg, gfp_mask, nr_pages, false) == 0) {
 		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 		return true;
 	}
-- 
2.52.0

