* [PATCH v4 1/5] mm/page_counter: introduce per-page_counter stock
2026-06-23 18:01 [PATCH v4 0/5] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
@ 2026-06-23 18:01 ` Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 2/5] mm/memcontrol: flatten try_charge_memcg control flow Joshua Hahn
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-06-23 18:01 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko
Cc: Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, cgroups,
linux-mm, linux-kernel, kernel-team
In order to avoid expensive hierarchy walks on every memcg charge and
limit check, memcontrol uses per-cpu stocks (memcg_stock_pcp) to cache
pre-charged pages and introduce a fast path to try_charge_memcg.
However, there are a few quirks with the current implementation that
could be improved upon.
First, each memcg_stock_pcp can only cache the charges of 7 memcgs
(defined as NR_MEMCG_STOCK), which means that once a CPU starts handling
the charging of more than 7 memcgs, it randomly selects a victim memcg
to evict and drain from the cpu, which can cause unnecessarily increased
latencies and thrashing as memcgs continually evict each other's stock.
Flushing a memcg's stock on a CPU means that all other stock present
on that CPU is also flushed, leading to poor caching for systems
running multiple memcgs competing for the same CPUs.
Finally, stock is tightly coupled with memcg, which means that all page
counters in a memcg share the same resource. This may simplify some of
the charging logic, but it prevents new page counters from being added
and using a separate stock.
We can address these concerns by pushing the concept of stock down to
the page_counter level, which addresses the random eviction problem by
getting rid of the 7 slot limit, and makes enabling separate stock
caches for other page_counters simpler.
Introduce a generic per-cpu stock directly in struct page_counter.
Stock can optionally be enabled per-page_counter, limiting the overhead
increase for page_counters that do not benefit greatly from caching
charges.
In this scheme, stock usage and refills happen via lockless atomic
operations, eliminating the need for asynchronous workqueues as well.
In this commit we introduce the alloc, free, and drain operations,
although they are unused for now.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/page_counter.h | 15 +++++++++++++
mm/page_counter.c | 42 ++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index d649b6bbbc871..4abc7fe7c3494 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -5,8 +5,17 @@
#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/limits.h>
+#include <linux/percpu.h>
#include <asm/page.h>
+struct page_counter_stock {
+ /*
+ * Consumed (atomic_try_cmpxchg) and refilled (atomic_add) only from the
+ * owning cpu. Remote access only happens on drain via atomic_xchg.
+ */
+ atomic_t nr_pages; /* pre-charged pages cached on this cpu */
+};
+
struct page_counter {
/*
* Make sure 'usage' does not share cacheline with any other field in
@@ -41,6 +50,8 @@ struct page_counter {
unsigned long high;
unsigned long max;
struct page_counter *parent;
+ struct page_counter_stock __percpu *stock;
+ unsigned int batch;
} ____cacheline_internodealigned_in_smp;
#if BITS_PER_LONG == 32
@@ -99,6 +110,10 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
counter->watermark = usage;
}
+void page_counter_drain_stock(struct page_counter *counter, unsigned int cpu);
+int page_counter_alloc_stock(struct page_counter *counter, unsigned int batch);
+void page_counter_free_stock(struct page_counter *counter);
+
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
void page_counter_calculate_protection(struct page_counter *root,
struct page_counter *counter,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 661e0f2a5127a..6bb48a913a90d 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -8,6 +8,7 @@
#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
+#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
@@ -289,6 +290,47 @@ int page_counter_memparse(const char *buf, const char *max,
return 0;
}
+void page_counter_drain_stock(struct page_counter *counter, unsigned int cpu)
+{
+ struct page_counter_stock *stock;
+ int nr_pages;
+
+ if (!counter->stock) /* no stock allocated for this counter */
+ return;
+
+ stock = per_cpu_ptr(counter->stock, cpu);
+ nr_pages = atomic_xchg(&stock->nr_pages, 0); /* empty @cpu's stock */
+ if (nr_pages) /* return the cached charge to @counter */
+ page_counter_uncharge(counter, nr_pages);
+}
+
+int page_counter_alloc_stock(struct page_counter *counter, unsigned int batch)
+{
+ struct page_counter_stock __percpu *stock;
+
+ stock = alloc_percpu(struct page_counter_stock);
+ if (!stock)
+ return -ENOMEM; /* @counter is left unchanged */
+
+ counter->stock = stock; /* non-NULL stock is what enables caching */
+ counter->batch = batch; /* charge size attempted on a stock miss */
+
+ return 0;
+}
+
+void page_counter_free_stock(struct page_counter *counter)
+{
+ int cpu;
+
+ if (!counter->stock) /* never allocated, or already freed */
+ return;
+
+ for_each_possible_cpu(cpu) /* uncharge every cached page first */
+ page_counter_drain_stock(counter, cpu);
+
+ free_percpu(counter->stock);
+ counter->stock = NULL; /* later callers see the stock as disabled */
+}
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v4 2/5] mm/memcontrol: flatten try_charge_memcg control flow
2026-06-23 18:01 [PATCH v4 0/5] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 1/5] mm/page_counter: introduce per-page_counter stock Joshua Hahn
@ 2026-06-23 18:01 ` Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 3/5] mm/page_counter: introduce page_counter_try_charge_stock() Joshua Hahn
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-06-23 18:01 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko
Cc: Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, cgroups,
linux-mm, linux-kernel, kernel-team
Refactor try_charge_memcg by flattening the nested memsw/memory
page_counter operations to separate the logic between the two.
When page_counter_try_charge is made stock-aware, this flattening makes
the control flow easier to follow since each page counter now has its
own success/failure paths.
No functional changes intended.
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56cd4af082326..306658fd55512 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2616,18 +2616,21 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
batch = nr_pages;
reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
- if (!do_memsw_account() ||
- page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (page_counter_try_charge(&memcg->memory, batch, &counter))
- goto done_restock;
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
- mem_over_limit = mem_cgroup_from_counter(counter, memory);
- } else {
+ if (do_memsw_account() &&
+ !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
+ goto reclaim;
}
+ if (page_counter_try_charge(&memcg->memory, batch, &counter))
+ goto done_restock;
+
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, batch);
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
+
+reclaim:
if (batch > nr_pages) {
batch = nr_pages;
goto retry;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v4 3/5] mm/page_counter: introduce page_counter_try_charge_stock()
2026-06-23 18:01 [PATCH v4 0/5] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 1/5] mm/page_counter: introduce per-page_counter stock Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 2/5] mm/memcontrol: flatten try_charge_memcg control flow Joshua Hahn
@ 2026-06-23 18:01 ` Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock Joshua Hahn
2026-06-23 18:01 ` [PATCH v4 5/5] mm/memcontrol: remove unused memcg_stock code Joshua Hahn
4 siblings, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-06-23 18:01 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko
Cc: Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, cgroups,
linux-mm, linux-kernel, kernel-team
Add a stock-aware variant of page_counter_try_charge.
As before with try_charge_memcg, it first tries to satisfy the charge by
consuming the per-cpu stock (and skipping the hierarchical charge). On a
miss, it tries to greedily overcharge up to counter->batch pages to
refill the stock. Finally, if this fails, it tries to charge exactly the
requested number of pages.
The number of pages that were charged to the page_counter is reported
back to the caller, so that stock hits don't trigger memory limit
checks.
The variant is unused for now; memcg is converted in a later patch.
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/page_counter.h | 4 +++
mm/page_counter.c | 48 ++++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 4abc7fe7c3494..b97b5491447e4 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -84,6 +84,10 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
+bool page_counter_try_charge_stock(struct page_counter *counter,
+ unsigned long nr_pages,
+ struct page_counter **fail,
+ unsigned long *nr_charged);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 6bb48a913a90d..cce3af3f19e03 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -172,6 +172,54 @@ bool page_counter_try_charge(struct page_counter *counter,
return false;
}
+bool page_counter_try_charge_stock(struct page_counter *counter,
+ unsigned long nr_pages,
+ struct page_counter **fail,
+ unsigned long *nr_charged)
+{
+ struct page_counter_stock *stock;
+ unsigned long charge = 0; /* stays 0 on a stock hit */
+ int old;
+
+ if (!counter->stock) /* stock not enabled for this counter */
+ goto charge_exact;
+
+ preempt_disable(); /* stay on this cpu while using its stock */
+ stock = this_cpu_ptr(counter->stock);
+ old = atomic_read(&stock->nr_pages);
+ while ((unsigned long)old >= nr_pages) { /* enough cached? */
+ if (atomic_try_cmpxchg(&stock->nr_pages, &old,
+ old - (int)nr_pages)) { /* failure reloads old */
+ preempt_enable();
+ goto out_success; /* stock hit, nothing newly charged */
+ }
+ }
+ preempt_enable();
+
+ charge = max_t(unsigned long, READ_ONCE(counter->batch), nr_pages);
+ if (charge <= nr_pages) /* batch would leave no surplus to cache */
+ goto charge_exact;
+
+ if (page_counter_try_charge(counter, charge, fail)) {
+ preempt_disable(); /* bank the surplus in this cpu's stock */
+ stock = this_cpu_ptr(counter->stock);
+ atomic_add((int)(charge - nr_pages), &stock->nr_pages);
+ preempt_enable();
+ goto out_success;
+ }
+
+charge_exact:
+ /* stock disabled, batch <= nr_pages, or the batched charge failed */
+ charge = nr_pages;
+ if (!page_counter_try_charge(counter, charge, fail))
+ return false;
+
+out_success:
+ if (nr_charged) /* optional out-parameter, callers may pass NULL */
+ *nr_charged = charge; /* pages newly charged to @counter */
+ return true;
+}
+
/**
* page_counter_uncharge - hierarchically uncharge pages
* @counter: counter
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock
2026-06-23 18:01 [PATCH v4 0/5] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (2 preceding siblings ...)
2026-06-23 18:01 ` [PATCH v4 3/5] mm/page_counter: introduce page_counter_try_charge_stock() Joshua Hahn
@ 2026-06-23 18:01 ` Joshua Hahn
2026-06-24 14:43 ` Usama Arif
2026-06-23 18:01 ` [PATCH v4 5/5] mm/memcontrol: remove unused memcg_stock code Joshua Hahn
4 siblings, 1 reply; 10+ messages in thread
From: Joshua Hahn @ 2026-06-23 18:01 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko
Cc: Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, cgroups,
linux-mm, linux-kernel, kernel-team
Now with all of the memcg_stock handling logic replicated in
page_counter_stock, switch memcg to use the page_counter_stock for the
memory (and for cgroup v1 users, memsw) page_counters.
There are a few details that have changed:
First, the old special-casing for the !allow_spinning check to avoid
refilling and flushing of the old stock is removed. This special casing
was important previously, because refilling the stock could do a lot of
extra work by evicting one of 7 random victim memcgs in the percpu
memcg_stock slots. In the new per-counter design, refilling stock just
adds pages to the counter's own local cache without affecting other memcgs,
so the original reason for the special case no longer applies.
Also, we can now fail during page_counter_alloc_stock(), if there is
not enough memory to allocate a percpu page_counter_stock. This failure
is rare and nonfatal; the system can continue to operate, with the page
counter working without stock and falling back to walking the hierarchy.
drain_all_stock and memcg_hotplug_cpu_dead also now use the page_counter
stock drain variant, which uses remote atomic_xchg to retrieve stock
across CPUs, instead of scheduling asynchronous work.
Finally, as a side-effect of separating the per-memcg stock to per-
page_counter, the memsw and memory page_counters have independent stock.
This means that the reported memsw may transiently be lower than memory
usage if the stock for memory and memsw page_counters go out of sync.
Note that obj_stock is untouched by this change.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 87 +++++++++++++++++++++++--------------------------
1 file changed, 41 insertions(+), 46 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 306658fd55512..846800917af49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2269,39 +2269,36 @@ static void schedule_drain_work(int cpu, struct work_struct *work)
queue_work_on(cpu, memcg_wq, work);
}
+static void memcg_drain_stock(struct mem_cgroup *memcg, int cpu)
+{
+ page_counter_drain_stock(&memcg->memory, cpu);
+ if (do_memsw_account()) /* memsw stock is only allocated in this case */
+ page_counter_drain_stock(&memcg->memsw, cpu);
+}
+
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
*/
void drain_all_stock(struct mem_cgroup *root_memcg)
{
+ struct mem_cgroup *memcg;
int cpu, curcpu;
/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
return;
- /*
- * Notify other cpus that system-wide "drain" is running
- * We do not care about races with the cpu hotplug because cpu down
- * as well as workers from this path always operate on the local
- * per-cpu data. CPU up doesn't touch memcg_stock at all.
- */
+
+ for_each_mem_cgroup_tree(memcg, root_memcg) {
+ for_each_online_cpu(cpu)
+ memcg_drain_stock(memcg, cpu);
+ }
+
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
- is_memcg_drain_needed(memcg_st, root_memcg) &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE,
- &memcg_st->flags)) {
- if (cpu == curcpu)
- drain_local_memcg_stock(&memcg_st->work);
- else
- schedule_drain_work(cpu, &memcg_st->work);
- }
-
if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
obj_stock_flush_required(obj_st, root_memcg) &&
!test_and_set_bit(FLUSHING_CACHED_CHARGE,
@@ -2318,9 +2315,13 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
+ struct mem_cgroup *memcg;
+
/* no need for the local lock */
drain_obj_stock(&per_cpu(obj_stock, cpu));
- drain_stock_fully(&per_cpu(memcg_stock, cpu));
+
+ for_each_mem_cgroup(memcg)
+ memcg_drain_stock(memcg, cpu);
return 0;
}
@@ -2595,7 +2596,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -2606,36 +2606,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool raised_max_event = false;
unsigned long pflags;
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+ unsigned long nr_charged = 0;
retry:
- if (consume_stock(memcg, nr_pages))
- return 0;
-
- if (!allow_spinning)
- /* Avoid the refill and flush of the older stock */
- batch = nr_pages;
-
reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (do_memsw_account() &&
- !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+ !page_counter_try_charge_stock(&memcg->memsw, nr_pages,
+ &counter, NULL)) {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
goto reclaim;
}
- if (page_counter_try_charge(&memcg->memory, batch, &counter))
- goto done_restock;
+ if (page_counter_try_charge_stock(&memcg->memory, nr_pages,
+ &counter, &nr_charged)) {
+ if (!nr_charged)
+ return 0;
+ goto handle_high;
+ }
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
reclaim:
- if (batch > nr_pages) {
- batch = nr_pages;
- goto retry;
- }
-
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
@@ -2731,10 +2725,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
return 0;
-done_restock:
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
-
+handle_high:
/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
@@ -2771,7 +2762,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* and distribute reclaim work and delay penalties
* based on how much each task is actually allocating.
*/
- current->memcg_nr_pages_over_high += batch;
+ current->memcg_nr_pages_over_high += nr_charged;
set_notify_resume(current);
break;
}
@@ -3076,7 +3067,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
- refill_stock(memcg, nr_pages);
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -4080,6 +4071,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ page_counter_free_stock(&memcg->memory);
+ page_counter_free_stock(&memcg->memsw);
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
@@ -4247,6 +4240,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
refcount_set(&memcg->id.ref, 1);
css_get(css);
+ /* failure is nonfatal, charges fall back to direct hierarchy */
+ page_counter_alloc_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+ if (do_memsw_account())
+ page_counter_alloc_stock(&memcg->memsw, MEMCG_CHARGE_BATCH);
+
/*
* Ensure mem_cgroup_from_private_id() works once we're fully online.
*
@@ -5502,7 +5500,7 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
- refill_stock(memcg, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
}
void mem_cgroup_flush_workqueue(void)
@@ -5555,12 +5553,9 @@ int __init mem_cgroup_init(void)
memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
WARN_ON(!memcg_wq);
- for_each_possible_cpu(cpu) {
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_memcg_stock);
+ for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
drain_local_obj_stock);
- }
memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock
2026-06-23 18:01 ` [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock Joshua Hahn
@ 2026-06-24 14:43 ` Usama Arif
2026-06-24 15:23 ` Joshua Hahn
0 siblings, 1 reply; 10+ messages in thread
From: Usama Arif @ 2026-06-24 14:43 UTC (permalink / raw)
To: Joshua Hahn
Cc: Usama Arif, Johannes Weiner, Michal Hocko, Roman Gushchin,
Shakeel Butt, Muchun Song, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
On Tue, 23 Jun 2026 11:01:22 -0700 Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> Now with all of the memcg_stock handling logic replicated in
> page_counter_stock, switch memcg to use the page_counter_stock for the
> memory (and for cgroup v1 users, memsw) page_counters.
>
> There are a few details that have changed:
>
> First, the old special-casing for the !allow_spinning check to avoid
> refilling and flushing of the old stock is removed. This special casing
> was important previously, because refilling the stock could do a lot of
> extra work by evicting one of 7 random victim memcgs in the percpu
> memcg_stock slots. In the new per-counter design, refilling stock just
> adds pages to the counter's own local cache without affecting other memcgs,
> so the original reason for the special case no longer applies.
>
> Also, we can now fail during page_counter_alloc_stock(), if there is
> not enough memory to allocate a percpu page_counter_stock. This failure
> is rare and nonfatal; the system can continue to operate, with the page
> counter working without stock and falling back to walking the hierarchy.
>
> drain_all_stock and memcg_hotplug_cpu_dead also now use the page_counter
> stock drain variant, which uses remote atomic_xchg to retrieve stock
> across CPUs, instead of scheduling asynchronous work.
>
> Finally, as a side-effect of separating the per-memcg stock to per-
> page_counter, the memsw and memory page_counters have independent stock.
> This means that the reported memsw may transiently be lower than memory
> usage if the stock for memory and memsw page_counters go out of sync.
>
> Note that obj_stock is untouched by this change.
>
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
> ---
> mm/memcontrol.c | 87 +++++++++++++++++++++++--------------------------
> 1 file changed, 41 insertions(+), 46 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 306658fd55512..846800917af49 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2269,39 +2269,36 @@ static void schedule_drain_work(int cpu, struct work_struct *work)
> queue_work_on(cpu, memcg_wq, work);
> }
>
> +static void memcg_drain_stock(struct mem_cgroup *memcg, int cpu)
> +{
> + page_counter_drain_stock(&memcg->memory, cpu);
> + if (do_memsw_account())
> + page_counter_drain_stock(&memcg->memsw, cpu);
> +}
> +
> /*
> * Drains all per-CPU charge caches for given root_memcg resp. subtree
> * of the hierarchy under it.
> */
> void drain_all_stock(struct mem_cgroup *root_memcg)
> {
> + struct mem_cgroup *memcg;
> int cpu, curcpu;
>
> /* If someone's already draining, avoid adding running more workers. */
> if (!mutex_trylock(&percpu_charge_mutex))
> return;
> - /*
> - * Notify other cpus that system-wide "drain" is running
> - * We do not care about races with the cpu hotplug because cpu down
> - * as well as workers from this path always operate on the local
> - * per-cpu data. CPU up doesn't touch memcg_stock at all.
> - */
> +
> + for_each_mem_cgroup_tree(memcg, root_memcg) {
> + for_each_online_cpu(cpu)
> + memcg_drain_stock(memcg, cpu);
> + }
> +
> migrate_disable();
> curcpu = smp_processor_id();
> for_each_online_cpu(cpu) {
> - struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
> struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
>
> - if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
> - is_memcg_drain_needed(memcg_st, root_memcg) &&
> - !test_and_set_bit(FLUSHING_CACHED_CHARGE,
> - &memcg_st->flags)) {
> - if (cpu == curcpu)
> - drain_local_memcg_stock(&memcg_st->work);
> - else
> - schedule_drain_work(cpu, &memcg_st->work);
> - }
> -
> if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
> obj_stock_flush_required(obj_st, root_memcg) &&
> !test_and_set_bit(FLUSHING_CACHED_CHARGE,
> @@ -2318,9 +2315,13 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
>
> static int memcg_hotplug_cpu_dead(unsigned int cpu)
> {
> + struct mem_cgroup *memcg;
> +
> /* no need for the local lock */
> drain_obj_stock(&per_cpu(obj_stock, cpu));
> - drain_stock_fully(&per_cpu(memcg_stock, cpu));
> +
> + for_each_mem_cgroup(memcg)
> + memcg_drain_stock(memcg, cpu);
>
> return 0;
> }
> @@ -2595,7 +2596,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
> static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> unsigned int nr_pages)
> {
> - unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> int nr_retries = MAX_RECLAIM_RETRIES;
> struct mem_cgroup *mem_over_limit;
> struct page_counter *counter;
> @@ -2606,36 +2606,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> bool raised_max_event = false;
> unsigned long pflags;
> bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
> + unsigned long nr_charged = 0;
>
> retry:
> - if (consume_stock(memcg, nr_pages))
> - return 0;
> -
> - if (!allow_spinning)
> - /* Avoid the refill and flush of the older stock */
> - batch = nr_pages;
> -
> reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> if (do_memsw_account() &&
> - !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> + !page_counter_try_charge_stock(&memcg->memsw, nr_pages,
> + &counter, NULL)) {
> mem_over_limit = mem_cgroup_from_counter(counter, memsw);
> reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
> goto reclaim;
> }
>
> - if (page_counter_try_charge(&memcg->memory, batch, &counter))
> - goto done_restock;
> + if (page_counter_try_charge_stock(&memcg->memory, nr_pages,
> + &counter, &nr_charged)) {
> + if (!nr_charged)
> + return 0;
> + goto handle_high;
> + }
>
> if (do_memsw_account())
> - page_counter_uncharge(&memcg->memsw, batch);
> + page_counter_uncharge(&memcg->memsw, nr_pages);
This needs a transactional rollback. page_counter_try_charge_stock() can
succeed by consuming memsw stock and charging 0 new pages, but the
memory-failure path unconditionally uncharges nr_pages from memsw.
That turns a failed allocation into a real memsw usage decrement.
> mem_over_limit = mem_cgroup_from_counter(counter, memory);
>
> reclaim:
> - if (batch > nr_pages) {
> - batch = nr_pages;
> - goto retry;
> - }
> -
> /*
> * Prevent unbounded recursion when reclaim operations need to
> * allocate memory. This might exceed the limits temporarily,
> @@ -2731,10 +2725,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
> return 0;
>
> -done_restock:
> - if (batch > nr_pages)
> - refill_stock(memcg, batch - nr_pages);
> -
> +handle_high:
> /*
> * If the hierarchy is above the normal consumption range, schedule
> * reclaim on returning to userland. We can perform reclaim here
> @@ -2771,7 +2762,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> * and distribute reclaim work and delay penalties
> * based on how much each task is actually allocating.
> */
> - current->memcg_nr_pages_over_high += batch;
> + current->memcg_nr_pages_over_high += nr_charged;
> set_notify_resume(current);
> break;
> }
> @@ -3076,7 +3067,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
> account_kmem_nmi_safe(memcg, -nr_pages);
> memcg1_account_kmem(memcg, -nr_pages);
> if (!mem_cgroup_is_root(memcg))
> - refill_stock(memcg, nr_pages);
> + memcg_uncharge(memcg, nr_pages);
>
> css_put(&memcg->css);
> }
> @@ -4080,6 +4071,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
>
> static void mem_cgroup_free(struct mem_cgroup *memcg)
> {
> + page_counter_free_stock(&memcg->memory);
> + page_counter_free_stock(&memcg->memsw);
> lru_gen_exit_memcg(memcg);
> memcg_wb_domain_exit(memcg);
> __mem_cgroup_free(memcg);
> @@ -4247,6 +4240,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
> refcount_set(&memcg->id.ref, 1);
> css_get(css);
>
> + /* failure is nonfatal, charges fall back to direct hierarchy */
> + page_counter_alloc_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
> + if (do_memsw_account())
> + page_counter_alloc_stock(&memcg->memsw, MEMCG_CHARGE_BATCH);
> +
> /*
> * Ensure mem_cgroup_from_private_id() works once we're fully online.
> *
> @@ -5502,7 +5500,7 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
>
> mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
>
> - refill_stock(memcg, nr_pages);
> + page_counter_uncharge(&memcg->memory, nr_pages);
> }
>
> void mem_cgroup_flush_workqueue(void)
> @@ -5555,12 +5553,9 @@ int __init mem_cgroup_init(void)
> memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
> WARN_ON(!memcg_wq);
>
> - for_each_possible_cpu(cpu) {
> - INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
> - drain_local_memcg_stock);
> + for_each_possible_cpu(cpu)
> INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
> drain_local_obj_stock);
> - }
>
> memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
> memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
> --
> 2.53.0-Meta
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock
2026-06-24 14:43 ` Usama Arif
@ 2026-06-24 15:23 ` Joshua Hahn
2026-06-24 16:43 ` Usama Arif
0 siblings, 1 reply; 10+ messages in thread
From: Joshua Hahn @ 2026-06-24 15:23 UTC (permalink / raw)
To: Usama Arif
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
On Wed, 24 Jun 2026 07:43:47 -0700 Usama Arif <usama.arif@linux.dev> wrote:
> On Tue, 23 Jun 2026 11:01:22 -0700 Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
Hello Usama!!
Thank you for reviewing the patch :-)
[...snip...]
> > @@ -2595,7 +2596,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
> > static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > unsigned int nr_pages)
> > {
> > - unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> > int nr_retries = MAX_RECLAIM_RETRIES;
> > struct mem_cgroup *mem_over_limit;
> > struct page_counter *counter;
> > @@ -2606,36 +2606,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > bool raised_max_event = false;
> > unsigned long pflags;
> > bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
> > + unsigned long nr_charged = 0;
> >
> > retry:
> > - if (consume_stock(memcg, nr_pages))
> > - return 0;
> > -
> > - if (!allow_spinning)
> > - /* Avoid the refill and flush of the older stock */
> > - batch = nr_pages;
> > -
> > reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> > if (do_memsw_account() &&
> > - !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> > + !page_counter_try_charge_stock(&memcg->memsw, nr_pages,
> > + &counter, NULL)) {
> > mem_over_limit = mem_cgroup_from_counter(counter, memsw);
> > reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
> > goto reclaim;
> > }
> >
> > - if (page_counter_try_charge(&memcg->memory, batch, &counter))
> > - goto done_restock;
> > + if (page_counter_try_charge_stock(&memcg->memory, nr_pages,
> > + &counter, &nr_charged)) {
> > + if (!nr_charged)
> > + return 0;
> > + goto handle_high;
> > + }
> >
> > if (do_memsw_account())
> > - page_counter_uncharge(&memcg->memsw, batch);
> > + page_counter_uncharge(&memcg->memsw, nr_pages);
>
> This needs a transactional rollback. page_counter_try_charge_stock() can
> succeed by consuming memsw stock and charging 0 new pages, but the
> memory-failure path unconditionally uncharges nr_pages from memsw.
> That turns a failed allocation into a real memsw usage decrement.
Hmmmmmmmmmm....... I'm not sure.
At this point in the code, we are either (1) using cgroup v1 with memsw
and charged successfully, or (2) not using cgroup v1 with memsw. So I'm
not sure if this really is unconditional, we're just distinguishing
between cases (1) and (2) by checking if we're using cgroupv1.
Or is your concern with taking a charge via stock, but uncharging with
a hierarchical page_counter walk? If so, I think there's a case to be
made here with just simply returning the stock. I just wanted to keep
it consistent with the original memcontrol code, which only used
stock to fulfill charges, not uncharges, since this could make the
stock grow without bound.
What do you think? Thanks again for reviewing Usama, I hope you have a
great day!!!
Joshua
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock
2026-06-24 15:23 ` Joshua Hahn
@ 2026-06-24 16:43 ` Usama Arif
2026-06-24 18:24 ` Joshua Hahn
0 siblings, 1 reply; 10+ messages in thread
From: Usama Arif @ 2026-06-24 16:43 UTC (permalink / raw)
To: Joshua Hahn
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
On 24/06/2026 16:23, Joshua Hahn wrote:
> On Wed, 24 Jun 2026 07:43:47 -0700 Usama Arif <usama.arif@linux.dev> wrote:
>
>> On Tue, 23 Jun 2026 11:01:22 -0700 Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> Hello Usama!!
>
> Thank you for reviewing the patch : -)
>
> [...snip...]
>
>>> @@ -2595,7 +2596,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
>>> static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>>> unsigned int nr_pages)
>>> {
>>> - unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
>>> int nr_retries = MAX_RECLAIM_RETRIES;
>>> struct mem_cgroup *mem_over_limit;
>>> struct page_counter *counter;
>>> @@ -2606,36 +2606,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>>> bool raised_max_event = false;
>>> unsigned long pflags;
>>> bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
>>> + unsigned long nr_charged = 0;
>>>
>>> retry:
>>> - if (consume_stock(memcg, nr_pages))
>>> - return 0;
>>> -
>>> - if (!allow_spinning)
>>> - /* Avoid the refill and flush of the older stock */
>>> - batch = nr_pages;
>>> -
>>> reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
>>> if (do_memsw_account() &&
>>> - !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
>>> + !page_counter_try_charge_stock(&memcg->memsw, nr_pages,
>>> + &counter, NULL)) {
>>> mem_over_limit = mem_cgroup_from_counter(counter, memsw);
>>> reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
>>> goto reclaim;
>>> }
>>>
>>> - if (page_counter_try_charge(&memcg->memory, batch, &counter))
>>> - goto done_restock;
>>> + if (page_counter_try_charge_stock(&memcg->memory, nr_pages,
>>> + &counter, &nr_charged)) {
>>> + if (!nr_charged)
>>> + return 0;
>>> + goto handle_high;
>>> + }
>>>
>>> if (do_memsw_account())
>>> - page_counter_uncharge(&memcg->memsw, batch);
>>> + page_counter_uncharge(&memcg->memsw, nr_pages);
>>
>> This needs a transactional rollback. page_counter_try_charge_stock() can
>> succeed by consuming memsw stock and charging 0 new pages, but the
>> memory-failure path unconditionally uncharges nr_pages from memsw.
>> That turns a failed allocation into a real memsw usage decrement.
>
> Hmmmmmmmmmm....... I'm not sure.
>
> At this point in the code, we are either (1) using cgroup v1 with memsw
> and charged successfully, or (2) not using cgroup v1 with memsw. So I'm
> not sure if this really is unconditional, we're just distinguishing
> between cases (1) and (2) by checking if we're using cgroupv1.
>
> Or is your concern with taking a charge via stock, but uncharging with
> a hierarchical page_counter walk?
This was my concern. But I re-read the page_counter stock invariant,
and the stock-hit case is not an undercount? Consuming stock transfers
already-charged credit to the pending allocation; if the later memory charge
fails, page_counter_uncharge() discards that consumed credit from the
hierarchy. That should keep usage equal to real charges plus remaining stock?
> If so, I think there's a case to be
> made here with just simply returning the stock. I just wanted to keep
> it consistent with the original memcontrol code, which only used
> stock to fulfill charges, not uncharges, since this could make the
> stock grow without bound.
>
> What do you think? Thanks again for reviewing Usama, I hope you have a
> great day!!!
> Joshua
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock
2026-06-24 16:43 ` Usama Arif
@ 2026-06-24 18:24 ` Joshua Hahn
0 siblings, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-06-24 18:24 UTC (permalink / raw)
To: Usama Arif
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
On Wed, 24 Jun 2026 17:43:56 +0100 Usama Arif <usama.arif@linux.dev> wrote:
>
>
> On 24/06/2026 16:23, Joshua Hahn wrote:
> > On Wed, 24 Jun 2026 07:43:47 -0700 Usama Arif <usama.arif@linux.dev> wrote:
> >
> >> On Tue, 23 Jun 2026 11:01:22 -0700 Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> >
> > Hello Usama!!
> >
> > Thank you for reviewing the patch : -)
> >
> > [...snip...]
> >
> >>> @@ -2595,7 +2596,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
> >>> static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> >>> unsigned int nr_pages)
> >>> {
> >>> - unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> >>> int nr_retries = MAX_RECLAIM_RETRIES;
> >>> struct mem_cgroup *mem_over_limit;
> >>> struct page_counter *counter;
> >>> @@ -2606,36 +2606,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> >>> bool raised_max_event = false;
> >>> unsigned long pflags;
> >>> bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
> >>> + unsigned long nr_charged = 0;
> >>>
> >>> retry:
> >>> - if (consume_stock(memcg, nr_pages))
> >>> - return 0;
> >>> -
> >>> - if (!allow_spinning)
> >>> - /* Avoid the refill and flush of the older stock */
> >>> - batch = nr_pages;
> >>> -
> >>> reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> >>> if (do_memsw_account() &&
> >>> - !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> >>> + !page_counter_try_charge_stock(&memcg->memsw, nr_pages,
> >>> + &counter, NULL)) {
> >>> mem_over_limit = mem_cgroup_from_counter(counter, memsw);
> >>> reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
> >>> goto reclaim;
> >>> }
> >>>
> >>> - if (page_counter_try_charge(&memcg->memory, batch, &counter))
> >>> - goto done_restock;
> >>> + if (page_counter_try_charge_stock(&memcg->memory, nr_pages,
> >>> + &counter, &nr_charged)) {
> >>> + if (!nr_charged)
> >>> + return 0;
> >>> + goto handle_high;
> >>> + }
> >>>
> >>> if (do_memsw_account())
> >>> - page_counter_uncharge(&memcg->memsw, batch);
> >>> + page_counter_uncharge(&memcg->memsw, nr_pages);
> >>
> >> This needs a transactional rollback. page_counter_try_charge_stock() can
> >> succeed by consuming memsw stock and charging 0 new pages, but the
> >> memory-failure path unconditionally uncharges nr_pages from memsw.
> >> That turns a failed allocation into a real memsw usage decrement.
> >
> > Hmmmmmmmmmm....... I'm not sure.
> >
> > At this point in the code, we are either (1) using cgroup v1 with memsw
> > and charged successfully, or (2) not using cgroup v1 with memsw. So I'm
> > not sure if this really is unconditional, we're just distinguishing
> > between cases (1) and (2) by checking if we're using cgroupv1.
> >
> > Or is your concern with taking a charge via stock, but uncharging with
> > a hierarchical page_counter walk?
>
> This was my concern. But I re-read the page_counter stock invariant,
> and the stock-hit case is not an undercount? Consuming stock transfers
> already-charged credit to the pending allocation; if the later memory charge
> fails, page_counter_uncharge() discards that consumed credit from the
> hierarchy. That should keep usage equal to real charges plus remaining stock?
Yes, stock-hit case just does some math without doing any actual
charging. It's stuff that was pre-charged before, so we're not doing
any undercounting or leaking any charges.
What do you mean by "consumed credit"? From what I can see
page_counter_uncharge --> page_counter_cancel subtracts from
counter->usage, which should be the real charge + hierarchy walk.
Am I missing something? :p Please feel free to let me know!
Joshua
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v4 5/5] mm/memcontrol: remove unused memcg_stock code
2026-06-23 18:01 [PATCH v4 0/5] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (3 preceding siblings ...)
2026-06-23 18:01 ` [PATCH v4 4/5] mm/memcontrol: convert memcg to use page_counter_stock Joshua Hahn
@ 2026-06-23 18:01 ` Joshua Hahn
4 siblings, 0 replies; 10+ messages in thread
From: Joshua Hahn @ 2026-06-23 18:01 UTC (permalink / raw)
To: Johannes Weiner, Michal Hocko
Cc: Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, cgroups,
linux-mm, linux-kernel, kernel-team
Now that all memcg_stock logic has been moved to page_counter_stock, we
can remove all code related to handling memcg_stock. Note that obj_stock
is untouched and is still needed. FLUSHING_CACHED_CHARGE is preserved
so that it can be used by obj_stock as well.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 186 ------------------------------------------------
1 file changed, 186 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 846800917af49..762fb8914c308 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1998,25 +1998,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
-/*
- * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
- * nr_pages in a single cacheline. This may change in future.
- */
-#define NR_MEMCG_STOCK 7
#define FLUSHING_CACHED_CHARGE 0
-struct memcg_stock_pcp {
- local_trylock_t lock;
- uint8_t nr_pages[NR_MEMCG_STOCK];
- struct mem_cgroup *cached[NR_MEMCG_STOCK];
-
- struct work_struct work;
- unsigned long flags;
- uint8_t drain_idx;
-};
-
-static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
- .lock = INIT_LOCAL_TRYLOCK(lock),
-};
/*
* NR_OBJ_STOCK is sized so the entire hot path of obj_stock_pcp
@@ -2065,47 +2047,6 @@ static void drain_obj_stock(struct obj_stock_pcp *stock);
static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
-/**
- * consume_stock: Try to consume stocked charge on this cpu.
- * @memcg: memcg to consume from.
- * @nr_pages: how many pages to charge.
- *
- * Consume the cached charge if enough nr_pages are present otherwise return
- * failure. Also return failure for charge request larger than
- * MEMCG_CHARGE_BATCH or if the local lock is already taken.
- *
- * returns true if successful, false otherwise.
- */
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- struct memcg_stock_pcp *stock;
- uint8_t stock_pages;
- bool ret = false;
- int i;
-
- if (nr_pages > MEMCG_CHARGE_BATCH ||
- !local_trylock(&memcg_stock.lock))
- return ret;
-
- stock = this_cpu_ptr(&memcg_stock);
-
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- if (memcg != READ_ONCE(stock->cached[i]))
- continue;
-
- stock_pages = READ_ONCE(stock->nr_pages[i]);
- if (stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
- ret = true;
- }
- break;
- }
-
- local_unlock(&memcg_stock.lock);
-
- return ret;
-}
-
static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
@@ -2113,51 +2054,6 @@ static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-/*
- * Returns stocks cached in percpu and reset cached information.
- */
-static void drain_stock(struct memcg_stock_pcp *stock, int i)
-{
- struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
- uint8_t stock_pages;
-
- if (!old)
- return;
-
- stock_pages = READ_ONCE(stock->nr_pages[i]);
- if (stock_pages) {
- memcg_uncharge(old, stock_pages);
- WRITE_ONCE(stock->nr_pages[i], 0);
- }
-
- css_put(&old->css);
- WRITE_ONCE(stock->cached[i], NULL);
-}
-
-static void drain_stock_fully(struct memcg_stock_pcp *stock)
-{
- int i;
-
- for (i = 0; i < NR_MEMCG_STOCK; ++i)
- drain_stock(stock, i);
-}
-
-static void drain_local_memcg_stock(struct work_struct *dummy)
-{
- struct memcg_stock_pcp *stock;
-
- if (WARN_ONCE(!in_task(), "drain in non-task context"))
- return;
-
- local_lock(&memcg_stock.lock);
-
- stock = this_cpu_ptr(&memcg_stock);
- drain_stock_fully(stock);
- clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
-
- local_unlock(&memcg_stock.lock);
-}
-
static void drain_local_obj_stock(struct work_struct *dummy)
{
struct obj_stock_pcp *stock;
@@ -2174,88 +2070,6 @@ static void drain_local_obj_stock(struct work_struct *dummy)
local_unlock(&obj_stock.lock);
}
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- struct memcg_stock_pcp *stock;
- struct mem_cgroup *cached;
- uint8_t stock_pages;
- bool success = false;
- int empty_slot = -1;
- int i;
-
- /*
- * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we
- * decide to increase it more than 127 then we will need more careful
- * handling of nr_pages[] in struct memcg_stock_pcp.
- */
- BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
-
- VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
-
- if (nr_pages > MEMCG_CHARGE_BATCH ||
- !local_trylock(&memcg_stock.lock)) {
- /*
- * In case of larger than batch refill or unlikely failure to
- * lock the percpu memcg_stock.lock, uncharge memcg directly.
- */
- memcg_uncharge(memcg, nr_pages);
- return;
- }
-
- stock = this_cpu_ptr(&memcg_stock);
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- cached = READ_ONCE(stock->cached[i]);
- if (!cached && empty_slot == -1)
- empty_slot = i;
- if (memcg == READ_ONCE(stock->cached[i])) {
- stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
- WRITE_ONCE(stock->nr_pages[i], stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock, i);
- success = true;
- break;
- }
- }
-
- if (!success) {
- i = empty_slot;
- if (i == -1) {
- i = stock->drain_idx++;
- if (stock->drain_idx == NR_MEMCG_STOCK)
- stock->drain_idx = 0;
- drain_stock(stock, i);
- }
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached[i], memcg);
- WRITE_ONCE(stock->nr_pages[i], nr_pages);
- }
-
- local_unlock(&memcg_stock.lock);
-}
-
-static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
- struct mem_cgroup *root_memcg)
-{
- struct mem_cgroup *memcg;
- bool flush = false;
- int i;
-
- rcu_read_lock();
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- memcg = READ_ONCE(stock->cached[i]);
- if (!memcg)
- continue;
-
- if (READ_ONCE(stock->nr_pages[i]) &&
- mem_cgroup_is_descendant(memcg, root_memcg)) {
- flush = true;
- break;
- }
- }
- rcu_read_unlock();
- return flush;
-}
-
static void schedule_drain_work(int cpu, struct work_struct *work)
{
/*
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 10+ messages in thread