From: Nhat Pham <nphamcs@gmail.com>
To: kasong@tencent.com
Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org,
apopple@nvidia.com, axelrasmussen@google.com, baohua@kernel.org,
baolin.wang@linux.alibaba.com, bhe@redhat.com, byungchul@sk.com,
cgroups@vger.kernel.org, chengming.zhou@linux.dev,
chrisl@kernel.org, corbet@lwn.net, david@kernel.org,
dev.jain@arm.com, gourry@gourry.net, hannes@cmpxchg.org,
hughd@google.com, jannh@google.com, joshua.hahnjy@gmail.com,
lance.yang@linux.dev, lenb@kernel.org, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-pm@vger.kernel.org, lorenzo.stoakes@oracle.com,
matthew.brost@intel.com, mhocko@suse.com, muchun.song@linux.dev,
npache@redhat.com, nphamcs@gmail.com, pavel@kernel.org,
peterx@redhat.com, peterz@infradead.org, pfalcato@suse.de,
rafael@kernel.org, rakie.kim@sk.com, roman.gushchin@linux.dev,
rppt@kernel.org, ryan.roberts@arm.com, shakeel.butt@linux.dev,
shikemeng@huaweicloud.com, surenb@google.com, tglx@kernel.org,
vbabka@suse.cz, weixugc@google.com, ying.huang@linux.alibaba.com,
yosry.ahmed@linux.dev, yuanchu@google.com,
zhengqi.arch@bytedance.com, ziy@nvidia.com, kernel-team@meta.com,
riel@surriel.com, haowenchao22@gmail.com
Subject: [RFC PATCH 4/5] mm, swap: only charge physical swap entries
Date: Thu, 28 May 2026 14:29:28 -0700 [thread overview]
Message-ID: <20260528212955.1912856-5-nphamcs@gmail.com> (raw)
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>
Stop double-charging vswap entries against memcg->swap. Previously,
the entry was charged once at vswap allocation (via
mem_cgroup_try_charge_swap) and implicitly again when physical
backing was allocated.
Split the lifecycle into four operations: record the memcg private
ID at vswap allocation without charging; charge memcg->swap only when
physical backing is allocated in folio_realloc_swap(); uncharge in
vswap_release_backing() (only the swapfile-backed entries on cgroup v2,
all nr entries on v1 memsw); and drop the ID ref in
__swap_cluster_free_entries() without uncharging.
Direct-mapped physical swap charging is unchanged.
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
include/linux/swap.h | 57 +++++++++++++++++++++
mm/memcontrol.c | 118 +++++++++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 109 ++++++++++++++++++++++++++++++++++++---
3 files changed, 276 insertions(+), 8 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3fb55485fc76..6f18ecdf0bb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -597,6 +597,43 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return __mem_cgroup_try_charge_swap(folio);
}
+/*
+ * Record the memcg for a folio's swap entry without charging swap.
+ * Nothing is recorded when the memory controller is disabled.
+ */
+extern void __mem_cgroup_record_swap(struct folio *folio);
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{
+	if (!mem_cgroup_disabled())
+		__mem_cgroup_record_swap(folio);
+}
+
+/*
+ * Charge @nr_pages of physical swap backing to @memcg.
+ * Reports success (0) without calling into memcg when the memory
+ * controller is disabled; otherwise forwards the result of
+ * __mem_cgroup_charge_backing_phys_swap().
+ */
+extern int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+						 unsigned int nr_pages);
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+						      unsigned int nr_pages)
+{
+	return mem_cgroup_disabled() ?
+		0 : __mem_cgroup_charge_backing_phys_swap(memcg, nr_pages);
+}
+
+/*
+ * Undo a physical swap backing charge of @nr_pages against @memcg.
+ * Does nothing when the memory controller is disabled.
+ */
+extern void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+						    unsigned int nr_pages);
+static inline void
+mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+				      unsigned int nr_pages)
+{
+	if (!mem_cgroup_disabled())
+		__mem_cgroup_uncharge_backing_phys_swap(memcg, nr_pages);
+}
+
+/*
+ * Drop @nr_pages memcg private ID references for @id without uncharging.
+ * Does nothing when the memory controller is disabled.
+ */
+extern void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+					  unsigned int nr_pages)
+{
+	if (!mem_cgroup_disabled())
+		__mem_cgroup_id_put_swap(id, nr_pages);
+}
+
extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
@@ -613,6 +650,26 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return 0;
}
+/*
+ * No-op stub of mem_cgroup_record_swap().
+ * NOTE(review): the enclosing preprocessor branch is outside this hunk;
+ * presumably this is the build without memcg swap accounting - confirm.
+ */
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{
+}
+
+/*
+ * Stub of mem_cgroup_charge_backing_phys_swap(): charges nothing and
+ * always reports success (0), so callers never see a charge failure here.
+ * NOTE(review): the enclosing preprocessor branch is outside this hunk;
+ * presumably this is the build without memcg swap accounting - confirm.
+ */
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ return 0;
+}
+
+/*
+ * No-op stub of mem_cgroup_uncharge_backing_phys_swap(); pairs with the
+ * charge stub that never charges anything.
+ * NOTE(review): the enclosing preprocessor branch is outside this hunk;
+ * presumably this is the build without memcg swap accounting - confirm.
+ */
+static inline void mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+}
+
+/*
+ * No-op stub of mem_cgroup_id_put_swap(): no ID reference is dropped.
+ * NOTE(review): the enclosing preprocessor branch is outside this hunk;
+ * presumably this is the build without memcg swap accounting - confirm.
+ */
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+ unsigned int nr_pages)
+{
+}
+
static inline void mem_cgroup_uncharge_swap(unsigned short id,
unsigned int nr_pages)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7492879b3239..91618da7ec20 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5513,6 +5513,124 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
return 0;
}
+/**
+ * __mem_cgroup_record_swap - tag swap slots with their memcg, no charge
+ * @folio: folio being added to swap
+ *
+ * Pins the memcg private ID via mem_cgroup_private_id_get_online() and
+ * stores that ID in the swap cgroup table for the folio's swap slots.
+ * memcg->swap is left untouched: for vswap entries the charge is deferred
+ * until physical backing is allocated.
+ *
+ * Does nothing under memsw accounting, for a folio without an objcg, and
+ * for a folio that is not in the swap cache.
+ */
+void __mem_cgroup_record_swap(struct folio *folio)
+{
+	unsigned int nr = folio_nr_pages(folio);
+	struct swap_cluster_info *cluster;
+	struct obj_cgroup *objcg;
+	struct mem_cgroup *owner;
+	bool in_swapcache;
+
+	if (do_memsw_account())
+		return;
+
+	objcg = folio_objcg(folio);
+	if (!objcg)
+		return;
+
+	/* Resolve the objcg's memcg and pin its ID under RCU protection. */
+	rcu_read_lock();
+	owner = obj_cgroup_memcg(objcg);
+	in_swapcache = folio_test_swapcache(folio);
+	if (in_swapcache)
+		owner = mem_cgroup_private_id_get_online(owner, nr);
+	rcu_read_unlock();
+
+	/* No swap slots to tag if the folio never made it into swap cache. */
+	if (!in_swapcache)
+		return;
+
+	/* The swap cgroup table is updated with the cluster locked. */
+	cluster = swap_cluster_get_and_lock(folio);
+	__swap_cgroup_set(cluster, swp_cluster_offset(folio->swap), nr,
+			  mem_cgroup_private_id(owner));
+	swap_cluster_unlock(cluster);
+}
+
+/**
+ * __mem_cgroup_charge_backing_phys_swap - charge memcg->swap, nothing else
+ * @memcg: memcg to charge; NULL is accepted and treated as success
+ * @nr_pages: number of physical swap pages to charge
+ *
+ * Counter-only half of swap charging for vswap entries. In contrast to
+ * __mem_cgroup_try_charge_swap(), the memcg private ID refcount is not
+ * touched here: that ref is taken by __mem_cgroup_record_swap() at vswap
+ * allocation and stays for the lifetime of the vswap entry. This helper
+ * runs when a vswap entry gains physical backing (folio_realloc_swap),
+ * which lets the counter and the ID ref be managed independently.
+ *
+ * The caller is expected to have resolved @memcg already (typically
+ * folio_memcg plus an ID comparison, avoiding IDR lookups on the hot path).
+ *
+ * Return: 0 on success, including the memsw-accounting and NULL-memcg
+ * no-op cases; -ENOMEM if the swap counter rejects the charge.
+ */
+int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+					  unsigned int nr_pages)
+{
+	struct page_counter *fail;
+
+	if (do_memsw_account() || !memcg)
+		return 0;
+
+	/* The root memcg skips the page counter; only its stat is updated. */
+	if (!mem_cgroup_is_root(memcg)) {
+		if (!page_counter_try_charge(&memcg->swap, nr_pages, &fail)) {
+			memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+			memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+			return -ENOMEM;
+		}
+	}
+
+	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+	return 0;
+}
+
+/**
+ * __mem_cgroup_uncharge_backing_phys_swap - uncharge swap counter, keep ID ref
+ * @memcg: memcg to uncharge; NULL makes this a no-op
+ * @nr_pages: number of physical swap pages to uncharge
+ *
+ * Counter-only half of swap uncharging for vswap entries. In contrast to
+ * __mem_cgroup_uncharge_swap(), the memcg private ID refcount is left
+ * alone: that ref is dropped separately by __mem_cgroup_id_put_swap() when
+ * the vswap entry itself is freed. This helper runs when physical backing
+ * is released (vswap_release_backing).
+ *
+ * The memsw counter is uncharged under memsw accounting, the swap counter
+ * otherwise; MEMCG_SWAP is decremented in both cases.
+ */
+void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+					     unsigned int nr_pages)
+{
+	struct page_counter *counter;
+
+	if (!memcg)
+		return;
+
+	/* The root memcg skips the page counter; only its stat is updated. */
+	if (!mem_cgroup_is_root(memcg)) {
+		counter = do_memsw_account() ? &memcg->memsw : &memcg->swap;
+		page_counter_uncharge(counter, nr_pages);
+	}
+
+	mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
+}
+
+/**
+ * __mem_cgroup_id_put_swap - release memcg private ID refs, no uncharge
+ * @id: cgroup private id whose refs are released
+ * @nr_pages: number of refs to drop
+ *
+ * Looks @id up under RCU and drops @nr_pages references on the memcg it
+ * maps to. An @id that no longer maps to a memcg is silently ignored.
+ * No counter or statistic is modified.
+ */
+void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages)
+{
+	struct mem_cgroup *owner;
+
+	rcu_read_lock();
+	owner = mem_cgroup_from_private_id(id);
+	if (owner)
+		mem_cgroup_private_id_put(owner, nr_pages);
+	rcu_read_unlock();
+}
+
/**
* __mem_cgroup_uncharge_swap - uncharge swap space
* @id: cgroup id to uncharge
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a0976be6a12b..be901fb741e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
+#include "memcontrol-v1.h"
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
@@ -2043,8 +2044,15 @@ int folio_alloc_swap(struct folio *folio)
goto again;
}
- /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
- if (unlikely(mem_cgroup_try_charge_swap(folio)))
+ /*
+ * Vswap entries: record memcg ID without charging — the charge is
+ * deferred to folio_realloc_swap when physical backing is allocated.
+ * Direct-mapped physical swap entries: charge immediately as today.
+ */
+ if (folio_test_swapcache(folio) &&
+ swap_is_vswap(__swap_entry_to_info(folio->swap)))
+ mem_cgroup_record_swap(folio);
+ else if (unlikely(mem_cgroup_try_charge_swap(folio)))
swap_cache_del_folio(folio);
if (unlikely(!folio_test_swapcache(folio)))
@@ -2096,6 +2104,26 @@ static void __swap_cluster_free_phys_backing(struct swap_info_struct *psi,
unsigned int ci_start,
unsigned int nr_pages);
+/*
+ * Uncharge one per-cgroup batch of vswap slots whose backing is released.
+ *
+ * @memcg_id: memcg private ID recorded for the batch; 0 means none
+ * @batch_nr: number of vswap slots in the batch
+ * @batch_nr_swapfile: how many of those slots had swapfile backing
+ *
+ * Under memsw accounting all @batch_nr slots are uncharged; otherwise only
+ * the @batch_nr_swapfile slots are.
+ *
+ * The memcg pointer is only used inside the RCU read-side section that
+ * looked it up, matching __mem_cgroup_id_put_swap(). That way the pointer's
+ * validity does not depend on the recorded ID reference still being held
+ * by the caller at this point.
+ */
+static void vswap_uncharge_cgroup_batch(unsigned short memcg_id,
+					unsigned int batch_nr,
+					unsigned int batch_nr_swapfile)
+{
+	struct mem_cgroup *memcg;
+	unsigned int n = do_memsw_account() ? batch_nr : batch_nr_swapfile;
+
+	/* Nothing to uncharge, or no memcg was recorded for these slots. */
+	if (!n || !memcg_id)
+		return;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_private_id(memcg_id);
+	/* A NULL lookup result is tolerated by the uncharge helper. */
+	mem_cgroup_uncharge_backing_phys_swap(memcg, n);
+	rcu_read_unlock();
+}
+
void vswap_release_backing(struct swap_cluster_info *ci,
unsigned int ci_start, unsigned int nr)
{
@@ -2106,12 +2134,36 @@ void vswap_release_backing(struct swap_cluster_info *ci,
unsigned int ci_off;
unsigned long vt;
swp_entry_t phys;
+ /*
+ * Per-cgroup uncharge batching: a single vswap_release_backing
+ * call can span multiple cgroups (e.g. batched free across
+ * folios), so we cannot uncharge with the first slot's memcg
+ * for the whole range.
+ */
+ unsigned short batch_id;
+ unsigned int batch_nr = 0, batch_nr_swapfile = 0;
lockdep_assert_held(&ci->lock);
ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+ batch_id = __swap_cgroup_get(ci, ci_start);
for (ci_off = ci_start; ci_off < ci_start + nr; ci_off++) {
+ unsigned short cur_id;
+
vt = __vtable_get(ci_dyn, ci_off);
+ cur_id = __swap_cgroup_get(ci, ci_off);
+
+ /*
+ * Flush per-cgroup uncharge when crossing a cgroup boundary.
+ */
+ if (cur_id != batch_id) {
+ vswap_uncharge_cgroup_batch(batch_id, batch_nr,
+ batch_nr_swapfile);
+ batch_id = cur_id;
+ batch_nr = 0;
+ batch_nr_swapfile = 0;
+ }
+ batch_nr++;
/*
* Flush batched physical slots when the next entry
@@ -2135,6 +2187,7 @@ void vswap_release_backing(struct swap_cluster_info *ci,
switch (vtable_type(vt)) {
case VSWAP_SWAPFILE:
+ batch_nr_swapfile++;
if (!phys_start) {
phys = vtable_to_phys(vt);
phys_start = swp_offset(phys);
@@ -2165,6 +2218,9 @@ void vswap_release_backing(struct swap_cluster_info *ci,
phys_start % SWAPFILE_CLUSTER,
phys_end - phys_start);
}
+
+ /* Final cgroup-batch flush. */
+ vswap_uncharge_cgroup_batch(batch_id, batch_nr, batch_nr_swapfile);
}
void vswap_store_folio(swp_entry_t entry, struct folio *folio)
@@ -2222,7 +2278,9 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
swp_entry_t vswap_entry = folio->swap;
struct swap_cluster_info *ci;
struct swap_cluster_info_dynamic *ci_dyn;
+ struct mem_cgroup *memcg;
unsigned int voff;
+ unsigned short memcg_id;
swp_entry_t phys_entry = {};
swp_entry_t pe;
int i, nr = folio_nr_pages(folio);
@@ -2245,9 +2303,33 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
return (swp_entry_t){};
voff = swp_cluster_offset(vswap_entry);
-
ci = __swap_entry_to_cluster(vswap_entry);
ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+
+ /*
+ * Resolve the memcg for physical swap charging. Compare
+ * folio_memcg against the recorded swap memcg ID — on match
+ * (common case), zero IDR lookups. Only fall back to IDR
+ * lookup on mismatch (task migrated cgroups).
+ */
+ spin_lock(&ci->lock);
+ memcg_id = __swap_cgroup_get(ci, voff);
+ spin_unlock(&ci->lock);
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ if (!memcg || mem_cgroup_private_id(memcg) != memcg_id)
+ memcg = memcg_id ? mem_cgroup_from_private_id(memcg_id) : NULL;
+ rcu_read_unlock();
+
+ if (mem_cgroup_charge_backing_phys_swap(memcg, nr)) {
+ __swap_cluster_free_phys_backing(
+ __swap_entry_to_info(phys_entry),
+ __swap_entry_to_cluster(phys_entry),
+ swp_cluster_offset(phys_entry), nr);
+ return (swp_entry_t){};
+ }
+
spin_lock(&ci->lock);
/*
* Install PHYS backing without freeing any prior contents of the
@@ -2468,10 +2550,11 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
unsigned short batch_id = 0, id_cur;
unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
unsigned int batch_off = ci_off;
+ bool is_vswap = swap_is_vswap(si);
VM_WARN_ON(ci->count < nr_pages);
- if (swap_is_vswap(si))
+ if (is_vswap)
vswap_release_backing(ci, ci_start, nr_pages);
ci->count -= nr_pages;
@@ -2491,18 +2574,28 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
/*
* Uncharge swap slots by memcg in batches. Consecutive
* slots with the same cgroup id are uncharged together.
+ * For vswap, only drop the ID ref — physical swap was
+ * already uncharged in vswap_release_backing above.
*/
id_cur = __swap_cgroup_clear(ci, ci_off, 1);
if (batch_id != id_cur) {
- if (batch_id)
- mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ if (batch_id) {
+ if (is_vswap)
+ mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+ else
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ }
batch_id = id_cur;
batch_off = ci_off;
}
} while (++ci_off < ci_end);
- if (batch_id)
- mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ if (batch_id) {
+ if (is_vswap)
+ mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+ else
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ }
__swap_cluster_finish_free(si, ci, ci_start, nr_pages);
}
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-28 21:30 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-28 21:29 [RFC PATCH 0/5] mm, swap: Virtual Swap Space (Swap Table Edition) Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 1/5] mm, swap: add virtual swap device infrastructure Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 2/5] mm, swap: support zswap and zeroswap as vswap backends Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 3/5] mm, swap: support physical swap as a vswap backend Nhat Pham
2026-05-28 21:29 ` Nhat Pham [this message]
2026-05-28 21:29 ` [RFC PATCH 5/5] mm, swap: add debugfs counters for vswap Nhat Pham
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260528212955.1912856-5-nphamcs@gmail.com \
--to=nphamcs@gmail.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=apopple@nvidia.com \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=byungchul@sk.com \
--cc=cgroups@vger.kernel.org \
--cc=chengming.zhou@linux.dev \
--cc=chrisl@kernel.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gourry@gourry.net \
--cc=hannes@cmpxchg.org \
--cc=haowenchao22@gmail.com \
--cc=hughd@google.com \
--cc=jannh@google.com \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=lenb@kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-pm@vger.kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=matthew.brost@intel.com \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=npache@redhat.com \
--cc=pavel@kernel.org \
--cc=peterx@redhat.com \
--cc=peterz@infradead.org \
--cc=pfalcato@suse.de \
--cc=rafael@kernel.org \
--cc=rakie.kim@sk.com \
--cc=riel@surriel.com \
--cc=roman.gushchin@linux.dev \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=surenb@google.com \
--cc=tglx@kernel.org \
--cc=vbabka@suse.cz \
--cc=weixugc@google.com \
--cc=ying.huang@linux.alibaba.com \
--cc=yosry.ahmed@linux.dev \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox