* [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-02-26 19:29 [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Joshua Hahn
@ 2026-02-26 19:29 ` Joshua Hahn
2026-03-03 23:53 ` Yosry Ahmed
2026-02-26 19:29 ` [PATCH 7/8] mm/memcontrol: Track MEMCG_ZSWAPPED in bytes Joshua Hahn
` (2 subsequent siblings)
3 siblings, 1 reply; 17+ messages in thread
From: Joshua Hahn @ 2026-02-26 19:29 UTC (permalink / raw)
To: Minchan Kim, Sergey Senozhatsky
Cc: Johannes Weiner, Yosry Ahmed, Nhat Pham, Nhat Pham,
Chengming Zhou, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Andrew Morton, cgroups, linux-mm, linux-kernel,
kernel-team
Now that zswap_entries do not directly track obj_cgroups of the entries,
handle the lifetime management and charging of these entries into the
zsmalloc layer.
One functional change is that zswap entries are now no longer accounted
by the size of the compressed object, but by the size of the size_class
slot they occupy.
This brings the charging one step closer to an accurate representation
of the memory consumed in the zpdesc; even if a compressed object
doesn't consume the entirety of an obj slot, the hole it creates between
the objects is dead space the obj is accountable for.
Thus, account the memory each object makes unusable, not the amount of
memory each object takes up.
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/memcontrol.h | 10 -------
mm/memcontrol.c | 51 ----------------------------------
mm/zsmalloc.c | 57 ++++++++++++++++++++++++++++++++++++--
mm/zswap.c | 8 ------
4 files changed, 55 insertions(+), 71 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6c82c8f73e1..dd4278b1ca35 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1824,22 +1824,12 @@ static inline bool memcg_is_dying(struct mem_cgroup *memcg)
#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
-void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
-void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
return true;
}
-static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
- size_t size)
-{
-}
-static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
- size_t size)
-{
-}
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
/* if zswap is disabled, do not block pages going to the swapping device */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 007413a53b45..3432e1afc037 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5433,57 +5433,6 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
return ret;
}
-/**
- * obj_cgroup_charge_zswap - charge compression backend memory
- * @objcg: the object cgroup
- * @size: size of compressed object
- *
- * This forces the charge after obj_cgroup_may_zswap() allowed
- * compression and storage in zswap for this cgroup to go ahead.
- */
-void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
-{
- struct mem_cgroup *memcg;
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return;
-
- VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
-
- /* PF_MEMALLOC context, charging must succeed */
- if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
- VM_WARN_ON_ONCE(1);
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
- rcu_read_unlock();
-}
-
-/**
- * obj_cgroup_uncharge_zswap - uncharge compression backend memory
- * @objcg: the object cgroup
- * @size: size of compressed object
- *
- * Uncharges zswap memory on page in.
- */
-void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
-{
- struct mem_cgroup *memcg;
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return;
-
- obj_cgroup_uncharge(objcg, size);
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
- rcu_read_unlock();
-}
-
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
/* if zswap is disabled, do not block pages going to the swapping device */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 067215a6ddcc..88c7cd399261 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
return true;
}
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+ /* PF_MEMALLOC context, charging must succeed */
+ if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+ VM_WARN_ON_ONCE(1);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+ rcu_read_unlock();
+}
+
+static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ obj_cgroup_uncharge(objcg, size);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+ rcu_read_unlock();
+}
+
static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
int size)
{
@@ -1018,6 +1056,12 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
return true;
}
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset) {}
+
+static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset) {}
+
static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
int size) {}
@@ -1334,8 +1378,11 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (objcg)
+ if (objcg) {
+ obj_cgroup_get(objcg);
+ zs_charge_objcg(zpdesc, objcg, class->size, off);
zpdesc_set_obj_cgroup(zpdesc, obj_idx, class->size, objcg);
+ }
if (!ZsHugePage(zspage))
off += ZS_HANDLE_SIZE;
@@ -1501,6 +1548,7 @@ static void obj_free(int class_size, unsigned long obj)
struct link_free *link;
struct zspage *zspage;
struct zpdesc *f_zpdesc;
+ struct obj_cgroup *objcg;
unsigned long f_offset;
unsigned int f_objidx;
void *vaddr;
@@ -1510,7 +1558,12 @@ static void obj_free(int class_size, unsigned long obj)
f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_zpdesc);
- zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL);
+ objcg = zpdesc_obj_cgroup(f_zpdesc, f_objidx, class_size);
+ if (objcg) {
+ zs_uncharge_objcg(f_zpdesc, objcg, class_size, f_offset);
+ obj_cgroup_put(objcg);
+ zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL);
+ }
vaddr = kmap_local_zpdesc(f_zpdesc);
link = (struct link_free *)(vaddr + f_offset);
diff --git a/mm/zswap.c b/mm/zswap.c
index 55161a5c9d4c..77d3c6516ed3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -711,10 +711,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
zs_free(entry->pool->zs_pool, entry->handle);
zswap_pool_put(entry->pool);
- if (objcg) {
- obj_cgroup_uncharge_zswap(objcg, entry->length);
- obj_cgroup_put(objcg);
- }
if (entry->length == PAGE_SIZE)
atomic_long_dec(&zswap_stored_incompressible_pages);
zswap_entry_cache_free(entry);
@@ -1437,10 +1433,6 @@ static bool zswap_store_page(struct page *page,
* when the entry is removed from the tree.
*/
zswap_pool_get(pool);
- if (objcg) {
- obj_cgroup_get(objcg);
- obj_cgroup_charge_zswap(objcg, entry->length);
- }
atomic_long_inc(&zswap_stored_pages);
if (entry->length == PAGE_SIZE)
atomic_long_inc(&zswap_stored_incompressible_pages);
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-02-26 19:29 ` [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc Joshua Hahn
@ 2026-03-03 23:53 ` Yosry Ahmed
2026-03-04 15:11 ` Joshua Hahn
0 siblings, 1 reply; 17+ messages in thread
From: Yosry Ahmed @ 2026-03-03 23:53 UTC (permalink / raw)
To: Joshua Hahn
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 067215a6ddcc..88c7cd399261 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> return true;
> }
>
> +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> + int size, unsigned long offset)
> +{
> + struct mem_cgroup *memcg;
> +
> + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> + return;
> +
> + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> +
> + /* PF_MEMALLOC context, charging must succeed */
> + if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> + VM_WARN_ON_ONCE(1);
> +
> + rcu_read_lock();
> + memcg = obj_cgroup_memcg(objcg);
> + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
Zsmalloc should not be updating zswap stats (e.g. in case zram starts
supporting memcg charging). How about moving the stat updates to
zswap?
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-03 23:53 ` Yosry Ahmed
@ 2026-03-04 15:11 ` Joshua Hahn
2026-03-04 15:46 ` Yosry Ahmed
0 siblings, 1 reply; 17+ messages in thread
From: Joshua Hahn @ 2026-03-04 15:11 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > index 067215a6ddcc..88c7cd399261 100644
> > --- a/mm/zsmalloc.c
> > +++ b/mm/zsmalloc.c
> > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > return true;
> > }
> >
> > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > + int size, unsigned long offset)
> > +{
> > + struct mem_cgroup *memcg;
> > +
> > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > + return;
> > +
> > + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > +
> > + /* PF_MEMALLOC context, charging must succeed */
> > + if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > + VM_WARN_ON_ONCE(1);
> > +
> > + rcu_read_lock();
> > + memcg = obj_cgroup_memcg(objcg);
> > + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
Hello Yosry, I hope you are doing well!
Thank you for your feedback :-)
> Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> supporting memcg charging). How about moving the stat updates to
> zswap?
Yeah... I think this was also a big point of concern for me. While reading
the code, I was really amazed by how clean the logical divide between
zsmalloc and zswap / zram was, and I wanted to preserve it as much as
possible.
There are a few problems, though. Probably the biggest is that migration
of zpdescs and compressed objects within them are invisible to zswap.
Of course, this is by design, but this leads to two problems.
zswap's ignorance of compressed objects' movements across physical nodes
makes it impossible to accurately charge and uncharge from the correct
memcg-lruvec.
Conversely, zsmalloc's ignorance of memcg association makes it impossible
to correctly restrict cpusets.mems during migration.
So the clean logical divide makes a lot of sense for separating the
high-level cgroup association, compression, etc. from the physical
location of the memory and migration / zpdesc compaction, but it would
appear that this comes at a cost of oversimplifying the logic and missing
out on accurate memory charging and a unified source of truth for the
counters.
The last thing I wanted to note was that I agree that zsmalloc doing
explicit zswap stat updates feels a bit awkward. The reason I chose to do
this right now is because when enlightening zsmalloc about the compressed
objs' objcgs, zswap is the only one that does this memory accounting.
So having an objcg is a bit of a proxy to understand that the consumer
is zswap (as opposed to zram). Of course, if zram starts to do memcg
accounting as well, we'll have to start doing some other checks to
see if the compressed object should be accounted as zram or zswap.
OK. That's all the defense I have for my design :-) Now for thinking
about other designs:
I also explored whether it makes sense to make zsmalloc call a hook into
zswap code during and after migrations. The problem is that there isn't
a good way to do the compressed object --> zswap entry lookup, and this
still doesn't solve the issue of zsmalloc migrating compressed objects
without checking whether that object can live on another node.
Maybe one possible approach is to turn the array of objcgs into an array
of backpointers from compressed objects to their corresponding zswap_entries?
One concern is that this does add 8 bytes of additional overhead per
zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
on whether there's a creative way to save some memory here, though...
Of course the other concern is what this will look like for zram users.
I guess it can be done similarly to what is done here, and only allocate
the array of pointers when called in from zswap.
Anyways, thank you for bringing this up. What do you think about the
options we have here? I hope that I've motivated why we want
per-memcg-lruvec accounting as well. Please let me know if there is anything
I can provide additional context for :-)
Have a great day!
Joshua
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-04 15:11 ` Joshua Hahn
@ 2026-03-04 15:46 ` Yosry Ahmed
2026-03-04 16:26 ` Joshua Hahn
2026-03-04 16:27 ` Nhat Pham
0 siblings, 2 replies; 17+ messages in thread
From: Yosry Ahmed @ 2026-03-04 15:46 UTC (permalink / raw)
To: Joshua Hahn
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
>
> > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > index 067215a6ddcc..88c7cd399261 100644
> > > --- a/mm/zsmalloc.c
> > > +++ b/mm/zsmalloc.c
> > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > > return true;
> > > }
> > >
> > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > + int size, unsigned long offset)
> > > +{
> > > + struct mem_cgroup *memcg;
> > > +
> > > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > + return;
> > > +
> > > + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > +
> > > + /* PF_MEMALLOC context, charging must succeed */
> > > + if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > + VM_WARN_ON_ONCE(1);
> > > +
> > > + rcu_read_lock();
> > > + memcg = obj_cgroup_memcg(objcg);
> > > + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
>
> Hello Yosry, I hope you are doing well!
> Thank you for your feedback : -)
>
> > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > supporting memcg charging). How about moving the stat updates to
> > zswap?
>
> Yeah... I think this was also a big point of concern for me. While reading
> the code, I was really amazed by how clean the logical divide between
> zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> possible.
>
> There are a few problems, though. Probably the biggest is that migration
> of zpdescs and compressed objects within them are invisible to zswap.
> Of course, this is by design, but this leads to two problems.
>
> zswap's ignorance of compressed objects' movements across physical nodes
> makes it impossible to accurately charge and uncharge from the correct
> memcg-lruvec.
>
> Conversely, zsmalloc's ignorance of memcg association makes it impossible
> to correctly restrict cpusets.mems during migration.
>
> So the clean logical divide makes a lot of sense for separating the
> high-level cgroup association, compression, etc. from the physical
> location of the memory and migration / zpdesc compaction, but it would
> appear that this comes at a cost of oversimplifying the logic and missing
> out on accurate memory charging and a unified source of truth for the
> counters.
>
> The last thing I wanted to note was that I agree that zsmalloc doing
> explicit zswap stat updates feels a bit awkward. The reason I chose to do
> this right now is because when enlightening zsmalloc about the compressed
> objs' objcgs, zswap is the only one that does this memory accounting.
> So having an objcg is a bit of a proxy to understand that the consumer
> is zswap (as opposed to zram). Of course, if zram starts to do memcg
> accounting as well, we'll have to start doing some other checks to
> see if the compresed object should be accounted as zram or zswap.
>
> OK. That's all the defense I have for my design : -) Now for thinking
> about other designs:
>
> I also explored whether it makes sense to make zsmalloc call a hook into
> zswap code during and after migrations. The problem is that there isn't
> a good way to do the compressed object --> zswap entry lookup, and this
> still doesn't solve the issue of zsmalloc migrating compressed objects
> without checking whether that object can live on another node.
>
> Maybe one possible approach is to turn the array of objcgs into an array
> of backpointers from compressed objects to their corresponding zswap_entries?
> One concern is that this does add 8 bytes of additional overhead per
> zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> on whether there's a creative way to save some memory here, though...
>
> Of course the other concern is what this will look like for zram users.
> I guess it can be done similarly to what is done here, and only allocate
> the array of pointers when called in from zswap.
>
> Anyways, thank you for bringing this up. What do you think about the
> options we have here? I hope that I've motivated why we want
> per-memcg-lruvec accounting as well. Please let me know if there is anything
> I can provide additional context for : -)
Thanks for the detailed elaboration.
AFAICT the only zswap-specific part is the actual stat indexes, what
if these are parameterized at the zsmalloc pool level? AFAICT zswap
and zram will never share a pool.
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-04 15:46 ` Yosry Ahmed
@ 2026-03-04 16:26 ` Joshua Hahn
2026-03-04 16:27 ` Nhat Pham
1 sibling, 0 replies; 17+ messages in thread
From: Joshua Hahn @ 2026-03-04 16:26 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
On Wed, 4 Mar 2026 07:46:48 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> >
> > On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > > index 067215a6ddcc..88c7cd399261 100644
> > > > --- a/mm/zsmalloc.c
> > > > +++ b/mm/zsmalloc.c
> > > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > > > return true;
> > > > }
> > > >
> > > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > > + int size, unsigned long offset)
> > > > +{
> > > > + struct mem_cgroup *memcg;
> > > > +
> > > > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > + return;
> > > > +
> > > > + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > > +
> > > > + /* PF_MEMALLOC context, charging must succeed */
> > > > + if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > > + VM_WARN_ON_ONCE(1);
> > > > +
> > > > + rcu_read_lock();
> > > > + memcg = obj_cgroup_memcg(objcg);
> > > > + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > > + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
> >
> > Hello Yosry, I hope you are doing well!
> > Thank you for your feedback : -)
> >
> > > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > > supporting memcg charging). How about moving the stat updates to
> > > zswap?
> >
> > Yeah... I think this was also a big point of concern for me. While reading
> > the code, I was really amazed by how clean the logical divide between
> > zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> > possible.
> >
> > There are a few problems, though. Probably the biggest is that migration
> > of zpdescs and compressed objects within them are invisible to zswap.
> > Of course, this is by design, but this leads to two problems.
> >
> > zswap's ignorance of compressed objects' movements across physical nodes
> > makes it impossible to accurately charge and uncharge from the correct
> > memcg-lruvec.
> >
> > Conversely, zsmalloc's ignorance of memcg association makes it impossible
> > to correctly restrict cpusets.mems during migration.
> >
> > So the clean logical divide makes a lot of sense for separating the
> > high-level cgroup association, compression, etc. from the physical
> > location of the memory and migration / zpdesc compaction, but it would
> > appear that this comes at a cost of oversimplifying the logic and missing
> > out on accurate memory charging and a unified source of truth for the
> > counters.
> >
> > The last thing I wanted to note was that I agree that zsmalloc doing
> > explicit zswap stat updates feels a bit awkward. The reason I chose to do
> > this right now is because when enlightening zsmalloc about the compressed
> > objs' objcgs, zswap is the only one that does this memory accounting.
> > So having an objcg is a bit of a proxy to understand that the consumer
> > is zswap (as opposed to zram). Of course, if zram starts to do memcg
> > accounting as well, we'll have to start doing some other checks to
> > see if the compresed object should be accounted as zram or zswap.
> >
> > OK. That's all the defense I have for my design : -) Now for thinking
> > about other designs:
> >
> > I also explored whether it makes sense to make zsmalloc call a hook into
> > zswap code during and after migrations. The problem is that there isn't
> > a good way to do the compressed object --> zswap entry lookup, and this
> > still doesn't solve the issue of zsmalloc migrating compressed objects
> > without checking whether that object can live on another node.
> >
> > Maybe one possible approach is to turn the array of objcgs into an array
> > of backpointers from compressed objects to their corresponding zswap_entries?
> > One concern is that this does add 8 bytes of additional overhead per
> > zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> > on whether there's a creative way to save some memory here, though...
> >
> > Of course the other concern is what this will look like for zram users.
> > I guess it can be done similarly to what is done here, and only allocate
> > the array of pointers when called in from zswap.
> >
> > Anyways, thank you for bringing this up. What do you think about the
> > options we have here? I hope that I've motivated why we want
> > per-memcg-lruvec accounting as well. Please let me know if there is anything
> > I can provide additional context for : -)
>
> Thanks for the detailed elaboration.
>
> AFAICT the only zswap-specific part is the actual stat indexes, what
> if these are parameterized at the zsmalloc pool level? AFAICT zswap
> and zram will never share a pool.
That's a great idea, we can abstract the ZSWAP and ZSWAPPED idxs as
"compressed" and "uncompressed" and leave the flexibility for zram
to do similar accounting in the future if they wish to.
Thanks for the suggestion, Yosry. I'll include this in the v2 and
send it out! I hope you have a great day!!
Joshua
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-04 15:46 ` Yosry Ahmed
2026-03-04 16:26 ` Joshua Hahn
@ 2026-03-04 16:27 ` Nhat Pham
2026-03-04 16:45 ` Yosry Ahmed
1 sibling, 1 reply; 17+ messages in thread
From: Nhat Pham @ 2026-03-04 16:27 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Joshua Hahn, Minchan Kim, Sergey Senozhatsky, Johannes Weiner,
Yosry Ahmed, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
On Wed, Mar 4, 2026 at 7:47 AM Yosry Ahmed <yosry@kernel.org> wrote:
>
> On Wed, Mar 4, 2026 at 7:11 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
> >
> > On Tue, 3 Mar 2026 15:53:31 -0800 Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > > > index 067215a6ddcc..88c7cd399261 100644
> > > > --- a/mm/zsmalloc.c
> > > > +++ b/mm/zsmalloc.c
> > > > @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
> > > > return true;
> > > > }
> > > >
> > > > +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
> > > > + int size, unsigned long offset)
> > > > +{
> > > > + struct mem_cgroup *memcg;
> > > > +
> > > > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > > > + return;
> > > > +
> > > > + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
> > > > +
> > > > + /* PF_MEMALLOC context, charging must succeed */
> > > > + if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
> > > > + VM_WARN_ON_ONCE(1);
> > > > +
> > > > + rcu_read_lock();
> > > > + memcg = obj_cgroup_memcg(objcg);
> > > > + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
> > > > + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
> >
> > Hello Yosry, I hope you are doing well!
> > Thank you for your feedback : -)
> >
> > > Zsmalloc should not be updating zswap stats (e.g. in case zram starts
> > > supporting memcg charging). How about moving the stat updates to
> > > zswap?
> >
> > Yeah... I think this was also a big point of concern for me. While reading
> > the code, I was really amazed by how clean the logical divide between
> > zsmalloc and zswap / zram were, and I wanted to preserve it as much as
> > possible.
> >
> > There are a few problems, though. Probably the biggest is that migration
> > of zpdescs and compressed objects within them are invisible to zswap.
> > Of course, this is by design, but this leads to two problems.
> >
> > zswap's ignorance of compressed objects' movements across physical nodes
> > makes it impossible to accurately charge and uncharge from the correct
> > memcg-lruvec.
> >
> > Conversely, zsmalloc's ignorance of memcg association makes it impossible
> > to correctly restrict cpusets.mems during migration.
> >
> > So the clean logical divide makes a lot of sense for separating the
> > high-level cgroup association, compression, etc. from the physical
> > location of the memory and migration / zpdesc compaction, but it would
> > appear that this comes at a cost of oversimplifying the logic and missing
> > out on accurate memory charging and a unified source of truth for the
> > counters.
> >
> > The last thing I wanted to note was that I agree that zsmalloc doing
> > explicit zswap stat updates feels a bit awkward. The reason I chose to do
> > this right now is because when enlightening zsmalloc about the compressed
> > objs' objcgs, zswap is the only one that does this memory accounting.
> > So having an objcg is a bit of a proxy to understand that the consumer
> > is zswap (as opposed to zram). Of course, if zram starts to do memcg
> > accounting as well, we'll have to start doing some other checks to
> > see if the compresed object should be accounted as zram or zswap.
> >
> > OK. That's all the defense I have for my design : -) Now for thinking
> > about other designs:
> >
> > I also explored whether it makes sense to make zsmalloc call a hook into
> > zswap code during and after migrations. The problem is that there isn't
> > a good way to do the compressed object --> zswap entry lookup, and this
> > still doesn't solve the issue of zsmalloc migrating compressed objects
> > without checking whether that object can live on another node.
> >
> > Maybe one possible approach is to turn the array of objcgs into an array
> > of backpointers from compressed objects to their corresponding zswap_entries?
> > One concern is that this does add 8 bytes of additional overhead per
> > zswap entry, and I'm not sure that this is acceptable. I'll keep thinking
> > on whether there's a creative way to save some memory here, though...
> >
> > Of course the other concern is what this will look like for zram users.
> > I guess it can be done similarly to what is done here, and only allocate
> > the array of pointers when called in from zswap.
> >
> > Anyways, thank you for bringing this up. What do you think about the
> > options we have here? I hope that I've motivated why we want
> > per-memcg-lruvec accounting as well. Please let me know if there is anything
> > I can provide additional context for : -)
>
> Thanks for the detailed elaboration.
>
> AFAICT the only zswap-specific part is the actual stat indexes, what
> if these are parameterized at the zsmalloc pool level? AFAICT zswap
> and zram will never share a pool.
TBH, if we were to start from scratch, these should be zsmalloc
counters not zswap counters. Only zsmalloc knows about the memory
placement and real memory consumption (i.e taking into account
intra-slot wasted space) - this information is abstracted away from
all of the callers. And if/when zram supports cgroup tracking, memory
used by zswap and memory used by zram is indistinguishable, no?
Anyway, Joshua, do you think this is doable? Seems promising to me,
but idk if it will be clean to implement or not.
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-04 16:27 ` Nhat Pham
@ 2026-03-04 16:45 ` Yosry Ahmed
2026-03-04 16:49 ` Nhat Pham
0 siblings, 1 reply; 17+ messages in thread
From: Yosry Ahmed @ 2026-03-04 16:45 UTC (permalink / raw)
To: Nhat Pham
Cc: Joshua Hahn, Minchan Kim, Sergey Senozhatsky, Johannes Weiner,
Yosry Ahmed, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
> > AFAICT the only zswap-specific part is the actual stat indexes, what
> > if these are parameterized at the zsmalloc pool level? AFAICT zswap
> > and zram will never share a pool.
>
> TBH, if we were to start from scratch, these should be zsmalloc
> counters not zswap counters. Only zsmalloc knows about the memory
> placement and real memory consumption (i.e taking into account
> intra-slot wasted space) - this information is abstracted away from
> all of the callers.
I agree, but we cannot change the zswap stats now that we added them.
Keep in mind that when they were added zsmalloc was not the only
backend.
> And if/when zram supports cgroup tracking, memory
> used by zswap and memory used by zram is indistinguishable, no?
It is distinguishable as long as they use different zsmalloc pools, I
don't see why we'd need to share a pool.
> Anyway, Joshua, do you think this is doable? Seems promising to me,
> but idk if it will be clean to implement or not.
Not sure what you mean here? Changing the stats to be zsmalloc-based?
IIUC we can't do this without breaking userspace.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
2026-03-04 16:45 ` Yosry Ahmed
@ 2026-03-04 16:49 ` Nhat Pham
0 siblings, 0 replies; 17+ messages in thread
From: Nhat Pham @ 2026-03-04 16:49 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Joshua Hahn, Minchan Kim, Sergey Senozhatsky, Johannes Weiner,
Yosry Ahmed, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton, cgroups,
linux-mm, linux-kernel, kernel-team
On Wed, Mar 4, 2026 at 8:46 AM Yosry Ahmed <yosry@kernel.org> wrote:
>
> > > AFAICT the only zswap-specific part is the actual stat indexes, what
> > > if these are parameterized at the zsmalloc pool level? AFAICT zswap
> > > and zram will never share a pool.
> >
> > TBH, if we were to start from scratch, these should be zsmalloc
> > counters not zswap counters. Only zsmalloc knows about the memory
> > placement and real memory consumption (i.e taking into account
> > intra-slot wasted space) - this information is abstracted away from
> > all of the callers.
>
> I agree, but we cannot change the zswap stats now that we added them.
> Keep in mind that when they were added zsmalloc was not the only
> backend.
>
> > And if/when zram supports cgroup tracking, memory
> > used by zswap and memory used by zram is indistinguishable, no?
>
> It is distinguishable as long as they use different zsmalloc pools, I
> don't see why we'd need to share a pool.
>
> > Anyway, Joshua, do you think this is doable? Seems promising to me,
> > but idk if it will be clean to implement or not.
>
> Not sure what you mean here? Changing the stats to be zsmalloc-based?
> IIUC we can't do this without breaking userspace.
No I meant your proposal haha. Sorry for being confusing - probably
need more caffeine :)
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH 7/8] mm/memcontrol: Track MEMCG_ZSWAPPED in bytes
2026-02-26 19:29 [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Joshua Hahn
2026-02-26 19:29 ` [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc Joshua Hahn
@ 2026-02-26 19:29 ` Joshua Hahn
2026-02-26 19:29 ` [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec Joshua Hahn
2026-03-02 21:31 ` [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Nhat Pham
3 siblings, 0 replies; 17+ messages in thread
From: Joshua Hahn @ 2026-02-26 19:29 UTC (permalink / raw)
To: Minchan Kim, Sergey Senozhatsky
Cc: Johannes Weiner, Yosry Ahmed, Nhat Pham, Nhat Pham,
Chengming Zhou, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Andrew Morton, cgroups, linux-mm, linux-kernel,
kernel-team
Zswap compresses and uncompresses in PAGE_SIZE units, which simplifies
the accounting for how much memory it has compressed. However, when a
compressed object is stored at the boundary of two zspages, accounting
at PAGE_SIZE units makes it difficult to fractionally charge each
backing zspage with the ratio of memory it backs for the compressed
object.
To make sub-PAGE_SIZE granularity charging possible for MEMCG_ZSWAPPED,
track the value in bytes and adjust its accounting accordingly.
No functional changes intended.
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/memcontrol.h | 2 +-
mm/memcontrol.c | 5 +++--
mm/zsmalloc.c | 4 ++--
mm/zswap.c | 6 ++++--
4 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dd4278b1ca35..d3952c918fd4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -38,7 +38,7 @@ enum memcg_stat_item {
MEMCG_VMALLOC,
MEMCG_KMEM,
MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED,
+ MEMCG_ZSWAPPED_B,
MEMCG_NR_STAT,
};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3432e1afc037..b662902d4e03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -340,7 +340,7 @@ static const unsigned int memcg_stat_items[] = {
MEMCG_VMALLOC,
MEMCG_KMEM,
MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED,
+ MEMCG_ZSWAPPED_B,
};
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
@@ -1345,7 +1345,7 @@ static const struct memory_stat memory_stats[] = {
{ "shmem", NR_SHMEM },
#ifdef CONFIG_ZSWAP
{ "zswap", MEMCG_ZSWAP_B },
- { "zswapped", MEMCG_ZSWAPPED },
+ { "zswapped", MEMCG_ZSWAPPED_B },
#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@@ -1393,6 +1393,7 @@ static int memcg_page_state_unit(int item)
switch (item) {
case MEMCG_PERCPU_B:
case MEMCG_ZSWAP_B:
+ case MEMCG_ZSWAPPED_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
return 1;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 88c7cd399261..6794927c60fb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -980,7 +980,7 @@ static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, 1);
rcu_read_unlock();
}
@@ -997,7 +997,7 @@ static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, -1);
rcu_read_unlock();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 77d3c6516ed3..97f38d0afa86 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1214,8 +1214,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
*/
if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ nr_backing >>= PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B);
+ nr_stored >>= PAGE_SHIFT;
} else {
nr_backing = zswap_total_pages();
nr_stored = atomic_long_read(&zswap_stored_pages);
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread

* [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
2026-02-26 19:29 [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Joshua Hahn
2026-02-26 19:29 ` [PATCH 6/8] mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc Joshua Hahn
2026-02-26 19:29 ` [PATCH 7/8] mm/memcontrol: Track MEMCG_ZSWAPPED in bytes Joshua Hahn
@ 2026-02-26 19:29 ` Joshua Hahn
2026-02-26 22:40 ` kernel test robot
2026-02-26 23:02 ` kernel test robot
2026-03-02 21:31 ` [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Nhat Pham
3 siblings, 2 replies; 17+ messages in thread
From: Joshua Hahn @ 2026-02-26 19:29 UTC (permalink / raw)
To: Minchan Kim, Sergey Senozhatsky
Cc: Johannes Weiner, Yosry Ahmed, Nhat Pham, Nhat Pham,
Chengming Zhou, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Andrew Morton,
cgroups, linux-mm, linux-kernel, kernel-team
Now that memcg charging happens in the zsmalloc layer where we have both
objcg and page information, we can specify which node's memcg lruvec
zswapped memory should be accounted to.
Move MEMCG_ZSWAP_B and MEMCG_ZSWAPPED_B from enum_node_stat_item to
int memcg_node_stat_items. Rename their prefix from MEMCG to NR to
reflect this move as well.
In addition, decouple the updates of node stats (vmstat) and
memcg-lruvec stats, since node stats can only track values at a
PAGE_SIZE granularity.
Finally, track the moving charges whenever a compressed object migrates
from one zspage to another.
memcg-lruvec stats are now updated precisely and proportionally when
compressed objects are split across pages. Unfortunately for node stats,
only NR_ZSWAP_B can be kept accurate. NR_ZSWAPPED_B works as a good
best-effort value, but cannot proportionally account for compressed
objects split across pages due to the coarse PAGE_SIZE granularity
of node stats. For such objects, NR_ZSWAPPED_B is accounted to the first
zpdesc's node stats.
Note that this is not a new inaccuracy, but one that is simply left
unable to be fixed as part of these changes. The small inaccuracy is
accepted in place of invasive changes across all of vmstat
infrastructure to begin tracking stats at byte granularity.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/memcontrol.h | 5 +--
include/linux/mmzone.h | 2 ++
mm/memcontrol.c | 18 +++++-----
mm/vmstat.c | 2 ++
mm/zsmalloc.c | 72 ++++++++++++++++++++++++++++++--------
mm/zswap.c | 4 +--
6 files changed, 76 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d3952c918fd4..ba97b86d9104 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -37,8 +37,6 @@ enum memcg_stat_item {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
MEMCG_NR_STAT,
};
@@ -932,6 +930,9 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
void mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx, int val);
+void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
+
static inline void mod_memcg_page_state(struct page *page,
enum memcg_stat_item idx, int val)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..ae16a90491ac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -258,6 +258,8 @@ enum node_stat_item {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
NR_BALLOON_PAGES,
NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b662902d4e03..dc7cfff97296 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -331,6 +331,8 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_ZSWAP_B,
+ NR_ZSWAPPED_B,
};
static const unsigned int memcg_stat_items[] = {
@@ -339,8 +341,6 @@ static const unsigned int memcg_stat_items[] = {
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
- MEMCG_ZSWAP_B,
- MEMCG_ZSWAPPED_B,
};
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
@@ -726,7 +726,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
+void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
@@ -1344,8 +1344,8 @@ static const struct memory_stat memory_stats[] = {
{ "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
#ifdef CONFIG_ZSWAP
- { "zswap", MEMCG_ZSWAP_B },
- { "zswapped", MEMCG_ZSWAPPED_B },
+ { "zswap", NR_ZSWAP_B },
+ { "zswapped", NR_ZSWAPPED_B },
#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@@ -1392,8 +1392,8 @@ static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
- case MEMCG_ZSWAP_B:
- case MEMCG_ZSWAPPED_B:
+ case NR_ZSWAP_B:
+ case NR_ZSWAPPED_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
return 1;
@@ -5424,7 +5424,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
/* Force flush to get accurate stats for charging */
__mem_cgroup_flush_stats(memcg, true);
- pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ pages = memcg_page_state(memcg, NR_ZSWAP_B) / PAGE_SIZE;
if (pages < max)
continue;
ret = false;
@@ -5453,7 +5453,7 @@ static u64 zswap_current_read(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
mem_cgroup_flush_stats(memcg);
- return memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ return memcg_page_state(memcg, NR_ZSWAP_B);
}
static int zswap_max_show(struct seq_file *m, void *v)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 99270713e0c1..4b10610bd999 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1279,6 +1279,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_HUGETLB_PAGE
[I(NR_HUGETLB)] = "nr_hugetlb",
#endif
+ [I(NR_ZSWAP_B)] = "zswap",
+ [I(NR_ZSWAPPED_B)] = "zswapped",
[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
#undef I
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 6794927c60fb..548e7f4b8bf6 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -810,6 +810,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct zpdesc *zpdesc, *next;
+ bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
assert_spin_locked(&class->lock);
@@ -823,6 +824,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
reset_zpdesc(zpdesc);
zpdesc_unlock(zpdesc);
zpdesc_dec_zone_page_state(zpdesc);
+ if (objcg)
+ dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
zpdesc_put(zpdesc);
zpdesc = next;
} while (zpdesc != NULL);
@@ -963,11 +966,45 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp,
return true;
}
-static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
- int size, unsigned long offset)
+static void __zs_mod_memcg_lruvec(struct zpdesc *zpdesc,
+ struct obj_cgroup *objcg, int size,
+ int sign, unsigned long offset)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int compressed_size = size, original_size = PAGE_SIZE;
+ int nid = page_to_nid(zpdesc_page(zpdesc));
+ int next_nid = nid;
+
+ if (offset + size > PAGE_SIZE) {
+ struct zpdesc *next_zpdesc = get_next_zpdesc(zpdesc);
+
+ next_nid = page_to_nid(zpdesc_page(next_zpdesc));
+ if (nid != next_nid) {
+ compressed_size = PAGE_SIZE - offset;
+ original_size = (PAGE_SIZE * compressed_size) / size;
+ }
+ }
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B, sign * compressed_size);
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B, sign * original_size);
+
+ if (nid != next_nid) {
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(next_nid));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B,
+ sign * (size - compressed_size));
+ mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B,
+ sign * (PAGE_SIZE - original_size));
+ }
+ rcu_read_unlock();
+}
+static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
+ int size, unsigned long offset)
+{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
@@ -977,28 +1014,30 @@ static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
VM_WARN_ON_ONCE(1);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, 1);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(zpdesc, objcg, size, 1, offset);
+
+ /*
+ * Node-level vmstats are charged in PAGE_SIZE units. As a
+ * best-effort, always charge NR_ZSWAPPED_B to the first zpdesc.
+ */
+ inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B);
}
static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg,
int size, unsigned long offset)
{
- struct mem_cgroup *memcg;
-
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
obj_cgroup_uncharge(objcg, size);
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
- mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, -1);
- rcu_read_unlock();
+ __zs_mod_memcg_lruvec(zpdesc, objcg, size, -1, offset);
+
+ /*
+ * Node-level vmstats are uncharged in PAGE_SIZE units. As a
+ * best-effort, always uncharge NR_ZSWAPPED_B to the first zpdesc.
+ */
+ dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B);
}
static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj,
@@ -1135,6 +1174,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
__zpdesc_set_zsmalloc(zpdesc);
zpdesc_inc_zone_page_state(zpdesc);
+ if (objcg)
+ inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
zpdescs[i] = zpdesc;
}
@@ -1149,6 +1190,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
err:
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
+ if (objcg)
+ dec_node_page_state(zpdesc_page(zpdescs[i]),
+ NR_ZSWAP_B);
free_zpdesc(zpdescs[i]);
}
cache_free_zspage(zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 97f38d0afa86..9e845e1d7214 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1214,9 +1214,9 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
*/
if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B);
+ nr_backing = memcg_page_state(memcg, NR_ZSWAP_B);
nr_backing >>= PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B);
+ nr_stored = memcg_page_state(memcg, NR_ZSWAPPED_B);
nr_stored >>= PAGE_SHIFT;
} else {
nr_backing = zswap_total_pages();
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread

* Re: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
2026-02-26 19:29 ` [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec Joshua Hahn
@ 2026-02-26 22:40 ` kernel test robot
2026-02-27 19:45 ` Joshua Hahn
2026-02-26 23:02 ` kernel test robot
1 sibling, 1 reply; 17+ messages in thread
From: kernel test robot @ 2026-02-26 22:40 UTC (permalink / raw)
To: Joshua Hahn, Minchan Kim, Sergey Senozhatsky
Cc: oe-kbuild-all, Johannes Weiner, Yosry Ahmed, Nhat Pham,
Chengming Zhou, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Andrew Morton,
Linux Memory Management List, cgroups, linux-kernel, kernel-team
Hi Joshua,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on linus/master v7.0-rc1]
[cannot apply to akpm-mm/mm-everything next-20260226]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joshua-Hahn/mm-zsmalloc-Rename-zs_object_copy-to-zs_obj_copy/20260227-033239
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20260226192936.3190275-9-joshua.hahnjy%40gmail.com
patch subject: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
config: powerpc64-randconfig-r072-20260227 (https://download.01.org/0day-ci/archive/20260227/202602270607.dJP65LGH-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
smatch version: v0.5.0-8994-gd50c5a4c
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260227/202602270607.dJP65LGH-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602270607.dJP65LGH-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/zsmalloc.c:813:17: error: call to undeclared function 'zpdesc_objcgs'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
813 | bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
| ^
1 error generated.
vim +/zpdesc_objcgs +813 mm/zsmalloc.c
808
809 static void __free_zspage(struct zs_pool *pool, struct size_class *class,
810 struct zspage *zspage)
811 {
812 struct zpdesc *zpdesc, *next;
> 813 bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
814
815 assert_spin_locked(&class->lock);
816
817 VM_BUG_ON(get_zspage_inuse(zspage));
818 VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
819
820 next = zpdesc = get_first_zpdesc(zspage);
821 do {
822 VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
823 next = get_next_zpdesc(zpdesc);
824 reset_zpdesc(zpdesc);
825 zpdesc_unlock(zpdesc);
826 zpdesc_dec_zone_page_state(zpdesc);
827 if (objcg)
828 dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
829 zpdesc_put(zpdesc);
830 zpdesc = next;
831 } while (zpdesc != NULL);
832
833 cache_free_zspage(zspage);
834
835 class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
836 atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
837 }
838
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
2026-02-26 22:40 ` kernel test robot
@ 2026-02-27 19:45 ` Joshua Hahn
0 siblings, 0 replies; 17+ messages in thread
From: Joshua Hahn @ 2026-02-27 19:45 UTC (permalink / raw)
To: kernel test robot
Cc: Minchan Kim, Sergey Senozhatsky, oe-kbuild-all, Johannes Weiner,
Yosry Ahmed, Nhat Pham, Chengming Zhou, Michal Hocko,
Roman Gushchin, Shakeel Butt, Muchun Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Andrew Morton, Linux Memory Management List,
cgroups, linux-kernel, kernel-team
On Fri, 27 Feb 2026 06:40:18 +0800 kernel test robot <lkp@intel.com> wrote:
> Hi Joshua,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on axboe/for-next]
> [also build test ERROR on linus/master v7.0-rc1]
> [cannot apply to akpm-mm/mm-everything next-20260226]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Joshua-Hahn/mm-zsmalloc-Rename-zs_object_copy-to-zs_obj_copy/20260227-033239
> base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
> patch link: https://lore.kernel.org/r/20260226192936.3190275-9-joshua.hahnjy%40gmail.com
> patch subject: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
> config: powerpc64-randconfig-r072-20260227 (https://download.01.org/0day-ci/archive/20260227/202602270607.dJP65LGH-lkp@intel.com/config)
> compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
> smatch version: v0.5.0-8994-gd50c5a4c
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260227/202602270607.dJP65LGH-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202602270607.dJP65LGH-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> mm/zsmalloc.c:813:17: error: call to undeclared function 'zpdesc_objcgs'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 813 | bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
> | ^
> 1 error generated.
Hi Kernel test robot,
Thanks again, this seems like the same problem of not defining
zpdesc_objcgs outside for the !ifdef CONFIG_MEMCG case. However, in this
case I think the change that needs to be made is actually to make the
charging happen unconditionally, and within the charging functions
check if objcg is present, since the node states can be updated even
without the concept of a memcg.
Thanks again!
Joshua
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
2026-02-26 19:29 ` [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec Joshua Hahn
2026-02-26 22:40 ` kernel test robot
@ 2026-02-26 23:02 ` kernel test robot
1 sibling, 0 replies; 17+ messages in thread
From: kernel test robot @ 2026-02-26 23:02 UTC (permalink / raw)
To: Joshua Hahn, Minchan Kim, Sergey Senozhatsky
Cc: oe-kbuild-all, Johannes Weiner, Yosry Ahmed, Nhat Pham,
Chengming Zhou, Michal Hocko, Roman Gushchin, Shakeel Butt,
Muchun Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Andrew Morton,
Linux Memory Management List, cgroups, linux-kernel, kernel-team
Hi Joshua,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on linus/master v7.0-rc1]
[cannot apply to akpm-mm/mm-everything next-20260226]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joshua-Hahn/mm-zsmalloc-Rename-zs_object_copy-to-zs_obj_copy/20260227-033239
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20260226192936.3190275-9-joshua.hahnjy%40gmail.com
patch subject: [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
config: m68k-defconfig (https://download.01.org/0day-ci/archive/20260227/202602270614.hOv7KIkV-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260227/202602270614.hOv7KIkV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602270614.hOv7KIkV-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/zsmalloc.c: In function '__free_zspage':
>> mm/zsmalloc.c:813:24: error: implicit declaration of function 'zpdesc_objcgs'; did you mean 'zpdesc_lock'? [-Wimplicit-function-declaration]
813 | bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
| ^~~~~~~~~~~~~
| zpdesc_lock
vim +813 mm/zsmalloc.c
808
809 static void __free_zspage(struct zs_pool *pool, struct size_class *class,
810 struct zspage *zspage)
811 {
812 struct zpdesc *zpdesc, *next;
> 813 bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc);
814
815 assert_spin_locked(&class->lock);
816
817 VM_BUG_ON(get_zspage_inuse(zspage));
818 VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
819
820 next = zpdesc = get_first_zpdesc(zspage);
821 do {
822 VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
823 next = get_next_zpdesc(zpdesc);
824 reset_zpdesc(zpdesc);
825 zpdesc_unlock(zpdesc);
826 zpdesc_dec_zone_page_state(zpdesc);
827 if (objcg)
828 dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B);
829 zpdesc_put(zpdesc);
830 zpdesc = next;
831 } while (zpdesc != NULL);
832
833 cache_free_zspage(zspage);
834
835 class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
836 atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
837 }
838
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting
2026-02-26 19:29 [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Joshua Hahn
` (2 preceding siblings ...)
2026-02-26 19:29 ` [PATCH 8/8] mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec Joshua Hahn
@ 2026-03-02 21:31 ` Nhat Pham
2026-03-03 17:51 ` Joshua Hahn
3 siblings, 1 reply; 17+ messages in thread
From: Nhat Pham @ 2026-03-02 21:31 UTC (permalink / raw)
To: Joshua Hahn
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Chengming Zhou, Michal Hocko, Roman Gushchin,
Shakeel Butt, Muchun Song, Andrew Morton, cgroups, linux-mm,
linux-kernel, kernel-team
On Thu, Feb 26, 2026 at 11:29 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> INTRODUCTION
> ============
> The current design for zswap and zsmalloc leaves a clean divide between
> layers of the memory stack. At the higher level, we have zswap, which
> interacts directly with memory consumers, compression algorithms, and
> handles memory usage accounting via memcg limits. At the lower level,
> we have zsmalloc, which handles the page allocation and migration of
> physical pages.
>
> While this logical separation simplifies the codebase, it leaves
> problems for accounting that requires both memory cgroup awareness and
> physical memory location. To name a few:
>
> - On tiered systems, it is impossible to understand how much toptier
> memory a cgroup is using, since zswap has no understanding of where
> the compressed memory is physically stored.
> + With SeongJae Park's work to store incompressible pages as-is in
> zswap [1], the size of compressed memory can become non-trivial,
> and easily consume a meaningful portion of memory.
>
> - cgroups that restrict memory nodes have no control over which nodes
> their zswapped objects live on. This can lead to unexpectedly high
> fault times for workloads, who must eat the remote access latency
> cost of retrieving the compressed object from a remote node.
> + Nhat Pham addressed this issue via a best-effort attempt to place
> compressed objects in the same page as the original page, but this
> cannot guarantee complete isolation [2].
>
> - On the flip side, zsmalloc's ignorance of cgroup also makes its
> shrinker memcg-unaware, which can lead to ineffective reclaim when
> pressure is localized to a single cgroup.
>
> Until recently, zpool acted as another layer of indirection between
> zswap and zsmalloc, which made bridging memcg and physical location
> difficult. Now that zsmalloc is the only allocator backend for zswap and
> zram [3], it is possible to move memory-cgroup accounting to the
> zsmalloc layer.
>
> Introduce a new per-zpdesc array of objcg pointers to track
> per-memcg-lruvec memory usage by zswap, while leaving zram users
> unaffected.
>
> This creates one source of truth for NR_ZSWAP, and more accurate
> accounting for NR_ZSWAPPED.
>
> This brings sizeof(struct zpdesc) from 56 bytes to 64 bytes, but this
> increase in size is unseen by the rest of the system because zpdesc
> overlays struct page. Implementation details and care taken to handle
> the page->memcg_data field can be found in patch 3.
>
> In addition, move the accounting of memcg charges to the zsmalloc layer,
> whose only user is zswap at the moment.
>
> PATCH OUTLINE
> =============
> Patches 1 and 2 are small cleanups that make the codebase consistent and
> easier to digest.
>
> Patches 3, 4, and 5 allocate and populate the new zpdesc->objcgs field
> with compressed objects' obj_cgroups. zswap_entry->objcgs is removed,
> and redirected to look at the zspage for memcg information.
>
> Patch 6 moves the charging and lifetime management of obj_cgroups to
> the zsmalloc layer, which leaves zswap only as a plumbing layer to hand
> cgroup information to zsmalloc.
>
> Patches 7 and 8 introduce node counters and memcg-lruvec counters for
> zswap. Special care is taken for compressed objects that span multiple
> nodes.
>
> [1] https://lore.kernel.org/linux-mm/20250822190817.49287-1-sj@kernel.org/
> [2] https://lore.kernel.org/linux-mm/20250402204416.3435994-1-nphamcs@gmail.com/#t3
> [3] https://lore.kernel.org/linux-mm/20250829162212.208258-1-hannes@cmpxchg.org/
> [4] https://lore.kernel.org/linux-mm/c8bc2dce-d4ec-c16e-8df4-2624c48cfc06@google.com/
>
> Joshua Hahn (8):
> mm/zsmalloc: Rename zs_object_copy to zs_obj_copy
> mm/zsmalloc: Make all obj_idx unsigned ints
> mm/zsmalloc: Introduce objcgs pointer in struct zpdesc
> mm/zsmalloc: Store obj_cgroup pointer in zpdesc
> mm/zsmalloc,zswap: Redirect zswap_entry->obcg to zpdesc
> mm/zsmalloc, zswap: Handle objcg charging and lifetime in zsmalloc
> mm/memcontrol: Track MEMCG_ZSWAPPED in bytes
> mm/vmstat, memcontrol: Track ZSWAP_B, ZSWAPPED_B per-memcg-lruvec
>
> drivers/block/zram/zram_drv.c | 17 +-
> include/linux/memcontrol.h | 15 +-
> include/linux/mmzone.h | 2 +
> include/linux/zsmalloc.h | 6 +-
> mm/memcontrol.c | 68 ++------
> mm/vmstat.c | 2 +
> mm/zpdesc.h | 25 ++-
> mm/zsmalloc.c | 282 ++++++++++++++++++++++++++++++++--
> mm/zswap.c | 67 ++++----
> 9 files changed, 345 insertions(+), 139 deletions(-)
I might have missed it and this might be in one of the latter patches,
but could also add some quick and dirty benchmark for zswap to ensure
there's no or minimal performance implications? IIUC there is a small
amount of extra overhead in certain steps, because we have to go
through zsmalloc to query objcg. Usemem or kernel build should suffice
IMHO.
To be clear, I don't anticipate any observable performance change, but
it's a good sanity check :) Besides, can't be too careful with stress
testing stuff :P
^ permalink raw reply [flat|nested] 17+ messages in thread

* Re: [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting
2026-03-02 21:31 ` [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting Nhat Pham
@ 2026-03-03 17:51 ` Joshua Hahn
2026-03-03 18:01 ` Nhat Pham
0 siblings, 1 reply; 17+ messages in thread
From: Joshua Hahn @ 2026-03-03 17:51 UTC (permalink / raw)
To: Nhat Pham
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Chengming Zhou, Michal Hocko, Roman Gushchin,
Shakeel Butt, Muchun Song, Andrew Morton, cgroups, linux-mm,
linux-kernel, kernel-team
On Mon, 2 Mar 2026 13:31:32 -0800 Nhat Pham <nphamcs@gmail.com> wrote:
> On Thu, Feb 26, 2026 at 11:29 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
[...snip...]
> > Introduce a new per-zpdesc array of objcg pointers to track
> > per-memcg-lruvec memory usage by zswap, while leaving zram users
> > unaffected.
[...snip...]
Hi Nhat! I hope you are doing well :-) Thank you for taking a look!
> I might have missed it and this might be in one of the latter patches,
> but could also add some quick and dirty benchmark for zswap to ensure
> there's no or minimal performance implications? IIUC there is a small
> amount of extra overhead in certain steps, because we have to go
> through zsmalloc to query objcg. Usemem or kernel build should suffice
> IMHO.
Yup, this was one of my concerns too. I tried to do a somewhat comprehensive
analysis below, hopefully this can show a good picture of what's happening.
Spoilers: there doesn't seem to be any significant regressions (< 1%)
and any regressions are within a small fraction of the standard deviation.
One thing that I have noticed is that there is a tangible reduction in
standard deviation for some of these benchmarks. I can't exactly pinpoint
why this is happening, but I'll take it as a win :p
> To be clear, I don't anticipate any observable performance change, but
> it's a good sanity check :) Besides, can't be too careful with stress
> testing stuff :P
For sure. I should have done these and included it in the original RFC,
but I think I might have been too eager to get the RFC out :-)
Will include in the second version of the series!
All the experiments below are done on a 2-NUMA system. The data is quite
compressible, which I think makes sense for measuring the overhead of accounting.
Benchmark 1
Allocating 2G memory to one node with 1G memory.high. Average across 10 trials
+-------------------------+---------+----------+
| | average | stddev |
+-------------------------+---------+----------+
| Baseline (11439c4635ed) | 8887.82 | 362.40 |
| Baseline + Series | 8944.16 | 356.45 |
+-------------------------+---------+----------+
| Delta | +0.634% | -1.642% |
+-------------------------+---------+----------+
Benchmark 2
Allocating 2G memory to one node with 1G memory.high, churn 5x through the
memory. Average across 5 trials.
+-------------------------+----------+----------+
| | average | stddev |
+-------------------------+----------+----------+
| Baseline (11439c4635ed) | 31152.96 | 166.23 |
| Baseline + Series | 31355.28 | 64.86 |
+-------------------------+----------+----------+
| Delta | +0.649% | -60.981% |
+-------------------------+----------+----------+
Benchmark 3
Allocating 2G memory to one node with 1G memory.high, split across 2 nodes.
Average across 5 trials.
+-------------------------+---------+----------+
| a | average | stddev |
+-------------------------+---------+----------+
| Baseline (11439c4635ed) | 16101.6 | 174.18 |
| Baseline + Series | 16022.4 | 117.17 |
+-------------------------+---------+----------+
| Delta | -0.492% | -32.731% |
+-------------------------+---------+----------+
Benchmark 4
Reading stat files 10000 times under memory pressure
memory.stat
+-------------------------+---------+----------+
| | average | stddev |
+-------------------------+---------+----------+
| Baseline (11439c4635ed) | 24524.4 | 501.7 |
| Baseline + Series | 24807.2 | 444.53 |
+-------------------------+---------+----------+
| Delta | +1.153% | -11.395% |
+-------------------------+---------+----------+
memory.numa_stat
+-------------------------+---------+---------+
| | average | stddev |
+-------------------------+---------+---------+
| Baseline (11439c4635ed) | 24807.2 | 444.53 |
| Baseline + Series | 23837.6 | 521.68 |
+-------------------------+---------+---------+
| Delta | -3.905% | +17.355% |
+-------------------------+---------+---------+
proc/vmstat
+-------------------------+---------+----------+
| | average | stddev |
+-------------------------+---------+----------+
| Baseline (11439c4635ed) | 24793.6 | 285.26 |
| Baseline + Series | 23815.6 | 553.44 |
+-------------------------+---------+----------+
| Delta | -3.945% | +94.012% |
+-------------------------+---------+----------+
^^^ Some big increase in standard deviation here, although there is some
decrease in the average time. Probably the most notable change that I've seen
from this patch.
node0/vmstat
+-------------------------+---------+----------+
| | average | stddev |
+-------------------------+---------+----------+
| Baseline (11439c4635ed) | 24541.4 | 281.41 |
| Baseline + Series | 24479 | 241.29 |
+-------------------------+---------+----------+
| Delta | -0.254% | -14.257% |
+-------------------------+---------+----------+
Lots of testing results, I think mostly negligible in terms of average, but
some non-negligible changes in standard deviation going in both directions.
I don't see anything too concerning off the top of my head, but for the
next version I'll try to do some more testing across different machines
as well (I don't have any machines with > 2 nodes, but maybe I can do
some tests on QEMU just to sanity check)
Thanks again, Nhat. Have a great day!
Joshua
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 0/8] mm/zswap, zsmalloc: Per-memcg-lruvec zswap accounting
2026-03-03 17:51 ` Joshua Hahn
@ 2026-03-03 18:01 ` Nhat Pham
0 siblings, 0 replies; 17+ messages in thread
From: Nhat Pham @ 2026-03-03 18:01 UTC (permalink / raw)
To: Joshua Hahn
Cc: Minchan Kim, Sergey Senozhatsky, Johannes Weiner, Yosry Ahmed,
Nhat Pham, Chengming Zhou, Michal Hocko, Roman Gushchin,
Shakeel Butt, Muchun Song, Andrew Morton, cgroups, linux-mm,
linux-kernel, kernel-team
On Tue, Mar 3, 2026 at 9:51 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> On Mon, 2 Mar 2026 13:31:32 -0800 Nhat Pham <nphamcs@gmail.com> wrote:
>
> > On Thu, Feb 26, 2026 at 11:29 AM Joshua Hahn <joshua.hahnjy@gmail.com> wrote:
>
> [...snip...]
>
> > > Introduce a new per-zpdesc array of objcg pointers to track
> > > per-memcg-lruvec memory usage by zswap, while leaving zram users
> > > unaffected.
>
> [...snip...]
>
> Hi Nhat! I hope you are doing well :-) Thank you for taking a look!
>
> > I might have missed it and this might be in one of the latter patches,
> > but could also add some quick and dirty benchmark for zswap to ensure
> > there's no or minimal performance implications? IIUC there is a small
> > amount of extra overhead in certain steps, because we have to go
> > through zsmalloc to query objcg. Usemem or kernel build should suffice
> > IMHO.
>
> Yup, this was one of my concerns too. I tried to do a somewhat comprehensive
> analysis below, hopefully this can show a good picture of what's happening.
> Spoilers: there doesn't seem to be any significant regressions (< 1%)
> and any regressions are within a small fraction of the standard deviation.
>
> One thing that I have noticed is that there is a tangible reduction in
> standard deviation for some of these benchmarks. I can't exactly pinpoint
> why this is happening, but I'll take it as a win :p
>
> > To be clear, I don't anticipate any observable performance change, but
> > it's a good sanity check :) Besides, can't be too careful with stress
> > testing stuff :P
>
> For sure. I should have done these and included it in the original RFC,
> but I think I might have been too eager to get the RFC out :-)
> Will include in the second version of the series!
>
> All the experiments below are done on a 2-NUMA system. The data is quite
> compressible, which I think makes sense for measuring the overhead of accounting.
>
> Benchmark 1
> Allocating 2G memory to one node with 1G memory.high. Average across 10 trials
> +-------------------------+---------+----------+
> | | average | stddev |
> +-------------------------+---------+----------+
> | Baseline (11439c4635ed) | 8887.82 | 362.40 |
> | Baseline + Series | 8944.16 | 356.45 |
> +-------------------------+---------+----------+
> | Delta | +0.634% | -1.642% |
> +-------------------------+---------+----------+
>
> Benchmark 2
> Allocating 2G memory to one node with 1G memory.high, churn 5x through the
> memory. Average across 5 trials.
> +-------------------------+----------+----------+
> | | average | stddev |
> +-------------------------+----------+----------+
> | Baseline (11439c4635ed) | 31152.96 | 166.23 |
> | Baseline + Series | 31355.28 | 64.86 |
> +-------------------------+----------+----------+
> | Delta | +0.649% | -60.981% |
> +-------------------------+----------+----------+
>
> Benchmark 3
> Allocating 2G memory to one node with 1G memory.high, split across 2 nodes.
> Average across 5 trials.
> +-------------------------+---------+----------+
> | | average | stddev |
> +-------------------------+---------+----------+
> | Baseline (11439c4635ed) | 16101.6 | 174.18 |
> | Baseline + Series | 16022.4 | 117.17 |
> +-------------------------+---------+----------+
> | Delta | -0.492% | -32.731% |
> +-------------------------+---------+----------+
>
> Benchmark 4
> Reading stat files 10000 times under memory pressure
>
> memory.stat
> +-------------------------+---------+----------+
> | | average | stddev |
> +-------------------------+---------+----------+
> | Baseline (11439c4635ed) | 24524.4 | 501.7 |
> | Baseline + Series | 24807.2 | 444.53 |
> +-------------------------+---------+----------+
> | Delta | +1.153% | -11.395% |
> +-------------------------+---------+----------+
>
> memory.numa_stat
> +-------------------------+---------+---------+
> | | average | stddev |
> +-------------------------+---------+---------+
> | Baseline (11439c4635ed) | 24807.2 | 444.53 |
> | Baseline + Series | 23837.6 | 521.68 |
> +-------------------------+---------+---------+
> | Delta | -3.905% | +17.355% |
> +-------------------------+---------+---------+
>
> proc/vmstat
> +-------------------------+---------+----------+
> | | average | stddev |
> +-------------------------+---------+----------+
> | Baseline (11439c4635ed) | 24793.6 | 285.26 |
> | Baseline + Series | 23815.6 | 553.44 |
> +-------------------------+---------+----------+
> | Delta | -3.945% | +94.012% |
> +-------------------------+---------+----------+
>
> ^^^ Some big increase in standard deviation here, although there is some
> decrease in the average time. Probably the most notable change that I've seen
> from this patch.
>
> node0/vmstat
> +-------------------------+---------+----------+
> | | average | stddev |
> +-------------------------+---------+----------+
> | Baseline (11439c4635ed) | 24541.4 | 281.41 |
> | Baseline + Series | 24479 | 241.29 |
> +-------------------------+---------+----------+
> | Delta | -0.254% | -14.257% |
> +-------------------------+---------+----------+
>
> Lots of testing results, I think mostly negligible in terms of average, but
> some non-negligible changes in standard deviation going in both directions.
> I don't see anything too concerning off the top of my head, but for the
> next version I'll try to do some more testing across different machines
> as well (I don't have any machines with > 2 nodes, but maybe I can do
> some tests on QEMU just to sanity check)
>
> Thanks again, Nhat. Have a great day!
> Joshua
Sounds like any meagre performance difference is smaller than noise :P
If it's this negligible on these microbenchmarks, I think they'll be
infinitesimal in production workloads where these operations are a
very small part.
Kinda makes sense, because objcgroup access is only done in very small
subsets of operations: zswap entry store and zswap entry free, which
can only happen once each per zswap entry.
I think we're fine, but I'll leave other reviewers comment on it as well.
^ permalink raw reply [flat|nested] 17+ messages in thread