From: Nhat Pham <nphamcs@gmail.com>
To: kasong@tencent.com
Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org,
apopple@nvidia.com, axelrasmussen@google.com, baohua@kernel.org,
baolin.wang@linux.alibaba.com, bhe@redhat.com, byungchul@sk.com,
cgroups@vger.kernel.org, chengming.zhou@linux.dev,
chrisl@kernel.org, corbet@lwn.net, david@kernel.org,
dev.jain@arm.com, gourry@gourry.net, hannes@cmpxchg.org,
hughd@google.com, jannh@google.com, joshua.hahnjy@gmail.com,
lance.yang@linux.dev, lenb@kernel.org, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-pm@vger.kernel.org, lorenzo.stoakes@oracle.com,
matthew.brost@intel.com, mhocko@suse.com, muchun.song@linux.dev,
npache@redhat.com, nphamcs@gmail.com, pavel@kernel.org,
peterx@redhat.com, peterz@infradead.org, pfalcato@suse.de,
rafael@kernel.org, rakie.kim@sk.com, roman.gushchin@linux.dev,
rppt@kernel.org, ryan.roberts@arm.com, shakeel.butt@linux.dev,
shikemeng@huaweicloud.com, surenb@google.com, tglx@kernel.org,
vbabka@suse.cz, weixugc@google.com, ying.huang@linux.alibaba.com,
yosry.ahmed@linux.dev, yuanchu@google.com,
zhengqi.arch@bytedance.com, ziy@nvidia.com, kernel-team@meta.com,
riel@surriel.com, haowenchao22@gmail.com
Subject: [RFC PATCH 4/5] mm, swap: only charge physical swap entries
Date: Thu, 28 May 2026 14:29:28 -0700 [thread overview]
Message-ID: <20260528212955.1912856-5-nphamcs@gmail.com> (raw)
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>
Stop double-charging vswap entries against memcg->swap. Previously,
the entry was charged once at vswap allocation (via
mem_cgroup_try_charge_swap) and implicitly again when physical
backing was allocated.
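Split the memcg accounting lifecycle of a vswap entry into four
operations:
 1. record the memcg private ID at vswap allocation, without charging
    (mem_cgroup_record_swap);
 2. charge memcg->swap only when physical backing is allocated in
    folio_realloc_swap (mem_cgroup_charge_backing_phys_swap);
 3. uncharge in vswap_release_backing -- only the swapfile-backed
    entries (nr_swapfile) on cgroup v2, all nr entries under v1 memsw
    accounting (mem_cgroup_uncharge_backing_phys_swap);
 4. drop the ID ref in __swap_cluster_free_entries, without uncharging
    (mem_cgroup_id_put_swap).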
Direct-mapped physical swap charging is unchanged.
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
include/linux/swap.h | 57 +++++++++++++++++++++
mm/memcontrol.c | 118 +++++++++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 109 ++++++++++++++++++++++++++++++++++++---
3 files changed, 276 insertions(+), 8 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3fb55485fc76..6f18ecdf0bb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -597,6 +597,43 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return __mem_cgroup_try_charge_swap(folio);
}
+extern void __mem_cgroup_record_swap(struct folio *folio);
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{ /* Wrapper around __mem_cgroup_record_swap() that skips all work when memcg is disabled. */
+ if (mem_cgroup_disabled())
+ return; /* memory controller off: nothing to record */
+ __mem_cgroup_record_swap(folio);
+}
+
+extern int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages);
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{ /* Wrapper around __mem_cgroup_charge_backing_phys_swap(); forwards its return value. */
+ if (mem_cgroup_disabled())
+ return 0; /* memory controller off: report success without charging */
+ return __mem_cgroup_charge_backing_phys_swap(memcg, nr_pages);
+}
+
+extern void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{ /* Wrapper around __mem_cgroup_uncharge_backing_phys_swap() that skips all work when memcg is disabled. */
+ if (mem_cgroup_disabled())
+ return; /* memory controller off: nothing to uncharge */
+ __mem_cgroup_uncharge_backing_phys_swap(memcg, nr_pages);
+}
+
+extern void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+ unsigned int nr_pages)
+{ /* Wrapper around __mem_cgroup_id_put_swap() that skips all work when memcg is disabled. */
+ if (mem_cgroup_disabled())
+ return; /* memory controller off: no ID refs to drop */
+ __mem_cgroup_id_put_swap(id, nr_pages);
+}
+
extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
@@ -613,6 +650,26 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return 0;
}
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{ /* No-op variant. NOTE(review): presumably built when memcg swap accounting is configured out; the enclosing #if is not visible in this hunk. */
+}
+
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{ /* No-op variant. NOTE(review): presumably built when memcg swap accounting is configured out; the enclosing #if is not visible in this hunk. */
+ return 0; /* always reports success; nothing is charged */
+}
+
+static inline void mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{ /* No-op variant. NOTE(review): presumably built when memcg swap accounting is configured out; the enclosing #if is not visible in this hunk. */
+}
+
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+ unsigned int nr_pages)
+{ /* No-op variant. NOTE(review): presumably built when memcg swap accounting is configured out; the enclosing #if is not visible in this hunk. */
+}
+
static inline void mem_cgroup_uncharge_swap(unsigned short id,
unsigned int nr_pages)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7492879b3239..91618da7ec20 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5513,6 +5513,124 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
return 0;
}
+/**
+ * __mem_cgroup_record_swap - record memcg for swap without charging
+ * @folio: folio being added to swap
+ *
+ * Pin the memcg private ID ref and record it in the swap cgroup table,
+ * but do not charge memcg->swap. Used for vswap entries where the charge
+ * is deferred until physical backing is allocated.
+ *
+ * Does nothing when memsw accounting is active, when the folio has no
+ * objcg, or when the folio is not in the swap cache. Otherwise
+ * folio_nr_pages() is passed as the ID ref count and the resulting ID is
+ * stored for that many slots, starting at the cluster offset of
+ * folio->swap, while the swap cluster is locked.
+ */
+void __mem_cgroup_record_swap(struct folio *folio)
+{
+ unsigned int nr_pages = folio_nr_pages(folio);
+ struct swap_cluster_info *ci;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
+
+ if (do_memsw_account())
+ return; /* memsw accounting: nothing is recorded by this helper */
+
+ objcg = folio_objcg(folio);
+ if (!objcg)
+ return; /* folio has no objcg: nothing to record */
+
+ rcu_read_lock(); /* objcg -> memcg resolution and the ID pin happen under RCU */
+ memcg = obj_cgroup_memcg(objcg);
+ if (!folio_test_swapcache(folio)) {
+ rcu_read_unlock();
+ return; /* folio is not in the swap cache: no swap slot to record against */
+ }
+
+ memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); /* NOTE(review): presumably may return a different, online memcg than the one passed in - confirm */
+ rcu_read_unlock();
+
+ ci = swap_cluster_get_and_lock(folio);
+ __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+ mem_cgroup_private_id(memcg));
+ swap_cluster_unlock(ci);
+}
+
+/**
+ * __mem_cgroup_charge_backing_phys_swap - charge memcg->swap counter only
+ * @memcg: the mem_cgroup to charge (may be NULL)
+ * @nr_pages: number of physical swap pages to charge
+ *
+ * Unlike __mem_cgroup_try_charge_swap(), this does NOT touch the memcg
+ * private ID refcount — the ID ref was pinned earlier by
+ * __mem_cgroup_record_swap() at vswap allocation time and lives for the
+ * lifetime of the vswap entry. This helper only updates the swap counter
+ * when a vswap entry transitions to physical backing (folio_realloc_swap),
+ * so the counter and the ID ref can be managed independently.
+ *
+ * The caller resolves the memcg (typically via folio_memcg + ID
+ * comparison to avoid IDR lookups on the hot path).
+ *
+ * Nothing is charged, and 0 is returned, when memsw accounting is active
+ * or @memcg is NULL. The root memcg is not charged against the
+ * memcg->swap page counter, but its MEMCG_SWAP stat is still incremented.
+ * When the page counter charge fails, MEMCG_SWAP_MAX and MEMCG_SWAP_FAIL
+ * events are raised and the stat is left untouched.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ struct page_counter *counter;
+
+ if (do_memsw_account())
+ return 0; /* memsw accounting: this helper charges nothing */
+ if (!memcg)
+ return 0; /* no memcg resolved by the caller: treat as success */
+
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return -ENOMEM;
+ }
+ mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); /* reached for root too, which skips the page counter */
+ return 0;
+}
+
+/**
+ * __mem_cgroup_uncharge_backing_phys_swap - uncharge memcg->swap counter only
+ * @memcg: the mem_cgroup to uncharge (may be NULL)
+ * @nr_pages: number of physical swap pages to uncharge
+ *
+ * Unlike __mem_cgroup_uncharge_swap(), this does NOT drop the memcg
+ * private ID refcount — that ref is dropped separately via
+ * __mem_cgroup_id_put_swap() when the vswap entry itself is freed.
+ * This helper only updates the swap counter when physical backing is
+ * released (vswap_release_backing), so the counter and ID ref can be
+ * managed independently.
+ *
+ * Does nothing when @memcg is NULL. Under memsw accounting the pages are
+ * uncharged from memcg->memsw instead of memcg->swap. The root memcg's
+ * page counters are not touched, but MEMCG_SWAP is decremented for every
+ * non-NULL @memcg.
+ *
+ * NOTE(review): __mem_cgroup_charge_backing_phys_swap() returns early
+ * under memsw accounting without touching memcg->memsw or MEMCG_SWAP, so
+ * what is released here in that mode must have been charged by a path
+ * that is not visible in this patch - confirm the pairing.
+ */
+void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!memcg)
+ return;
+
+ if (!mem_cgroup_is_root(memcg)) {
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ else
+ page_counter_uncharge(&memcg->swap, nr_pages);
+ }
+ mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); /* stat is updated for root as well */
+}
+
+/**
+ * __mem_cgroup_id_put_swap - drop memcg private ID ref without uncharging
+ * @id: cgroup private id
+ * @nr_pages: number of refs to drop
+ *
+ * The ID is resolved to a memcg and the refs are dropped inside a single
+ * RCU read-side section. If the ID does not resolve to a memcg, nothing
+ * is dropped. No page counter or MEMCG_SWAP stat is modified here.
+ */
+void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_private_id(id);
+ if (memcg)
+ mem_cgroup_private_id_put(memcg, nr_pages);
+ rcu_read_unlock();
+}
+
/**
* __mem_cgroup_uncharge_swap - uncharge swap space
* @id: cgroup id to uncharge
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a0976be6a12b..be901fb741e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
+#include "memcontrol-v1.h"
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
@@ -2043,8 +2044,15 @@ int folio_alloc_swap(struct folio *folio)
goto again;
}
- /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
- if (unlikely(mem_cgroup_try_charge_swap(folio)))
+ /*
+ * Vswap entries: record memcg ID without charging — the charge is
+ * deferred to folio_realloc_swap when physical backing is allocated.
+ * Direct-mapped physical swap entries: charge immediately as today.
+ */
+ if (folio_test_swapcache(folio) &&
+ swap_is_vswap(__swap_entry_to_info(folio->swap)))
+ mem_cgroup_record_swap(folio);
+ else if (unlikely(mem_cgroup_try_charge_swap(folio)))
swap_cache_del_folio(folio);
if (unlikely(!folio_test_swapcache(folio)))
@@ -2096,6 +2104,26 @@ static void __swap_cluster_free_phys_backing(struct swap_info_struct *psi,
unsigned int ci_start,
unsigned int nr_pages);
+/*
+ * Uncharge one per-cgroup batch of vswap slots whose backing is released.
+ *
+ * @memcg_id:          memcg private ID recorded for the batch (0 = none)
+ * @batch_nr:          number of slots in the batch
+ * @batch_nr_swapfile: number of those slots that had swapfile backing
+ *
+ * Under memsw accounting every slot in the batch is uncharged; otherwise
+ * only the swapfile-backed slots are. An empty batch is a no-op.
+ *
+ * The memcg returned by the ID lookup is dereferenced while still inside
+ * the RCU read-side section, the same way __mem_cgroup_id_put_swap() uses
+ * its lookup result, so this helper does not depend on the caller keeping
+ * the memcg alive by other means.
+ */
+static void vswap_uncharge_cgroup_batch(unsigned short memcg_id,
+					unsigned int batch_nr,
+					unsigned int batch_nr_swapfile)
+{
+	struct mem_cgroup *memcg;
+	unsigned int n;
+
+	if (do_memsw_account())
+		n = batch_nr;
+	else
+		n = batch_nr_swapfile;
+	if (!n)
+		return;
+
+	rcu_read_lock();
+	memcg = memcg_id ? mem_cgroup_from_private_id(memcg_id) : NULL;
+	/* A NULL memcg is handled (ignored) by the uncharge helper. */
+	mem_cgroup_uncharge_backing_phys_swap(memcg, n);
+	rcu_read_unlock();
+}
+
void vswap_release_backing(struct swap_cluster_info *ci,
unsigned int ci_start, unsigned int nr)
{
@@ -2106,12 +2134,36 @@ void vswap_release_backing(struct swap_cluster_info *ci,
unsigned int ci_off;
unsigned long vt;
swp_entry_t phys;
+ /*
+ * Per-cgroup uncharge batching: a single vswap_release_backing
+ * call can span multiple cgroups (e.g. batched free across
+ * folios), so we cannot uncharge with the first slot's memcg
+ * for the whole range.
+ */
+ unsigned short batch_id;
+ unsigned int batch_nr = 0, batch_nr_swapfile = 0;
lockdep_assert_held(&ci->lock);
ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+ batch_id = __swap_cgroup_get(ci, ci_start);
for (ci_off = ci_start; ci_off < ci_start + nr; ci_off++) {
+ unsigned short cur_id;
+
vt = __vtable_get(ci_dyn, ci_off);
+ cur_id = __swap_cgroup_get(ci, ci_off);
+
+ /*
+ * Flush per-cgroup uncharge when crossing a cgroup boundary.
+ */
+ if (cur_id != batch_id) {
+ vswap_uncharge_cgroup_batch(batch_id, batch_nr,
+ batch_nr_swapfile);
+ batch_id = cur_id;
+ batch_nr = 0;
+ batch_nr_swapfile = 0;
+ }
+ batch_nr++;
/*
* Flush batched physical slots when the next entry
@@ -2135,6 +2187,7 @@ void vswap_release_backing(struct swap_cluster_info *ci,
switch (vtable_type(vt)) {
case VSWAP_SWAPFILE:
+ batch_nr_swapfile++;
if (!phys_start) {
phys = vtable_to_phys(vt);
phys_start = swp_offset(phys);
@@ -2165,6 +2218,9 @@ void vswap_release_backing(struct swap_cluster_info *ci,
phys_start % SWAPFILE_CLUSTER,
phys_end - phys_start);
}
+
+ /* Final cgroup-batch flush. */
+ vswap_uncharge_cgroup_batch(batch_id, batch_nr, batch_nr_swapfile);
}
void vswap_store_folio(swp_entry_t entry, struct folio *folio)
@@ -2222,7 +2278,9 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
swp_entry_t vswap_entry = folio->swap;
struct swap_cluster_info *ci;
struct swap_cluster_info_dynamic *ci_dyn;
+ struct mem_cgroup *memcg;
unsigned int voff;
+ unsigned short memcg_id;
swp_entry_t phys_entry = {};
swp_entry_t pe;
int i, nr = folio_nr_pages(folio);
@@ -2245,9 +2303,33 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
return (swp_entry_t){};
voff = swp_cluster_offset(vswap_entry);
-
ci = __swap_entry_to_cluster(vswap_entry);
ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+
+ /*
+ * Resolve the memcg for physical swap charging. Compare
+ * folio_memcg against the recorded swap memcg ID — on match
+ * (common case), zero IDR lookups. Only fall back to IDR
+ * lookup on mismatch (task migrated cgroups).
+ */
+ spin_lock(&ci->lock);
+ memcg_id = __swap_cgroup_get(ci, voff);
+ spin_unlock(&ci->lock);
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ if (!memcg || mem_cgroup_private_id(memcg) != memcg_id)
+ memcg = memcg_id ? mem_cgroup_from_private_id(memcg_id) : NULL;
+ rcu_read_unlock();
+
+ if (mem_cgroup_charge_backing_phys_swap(memcg, nr)) {
+ __swap_cluster_free_phys_backing(
+ __swap_entry_to_info(phys_entry),
+ __swap_entry_to_cluster(phys_entry),
+ swp_cluster_offset(phys_entry), nr);
+ return (swp_entry_t){};
+ }
+
spin_lock(&ci->lock);
/*
* Install PHYS backing without freeing any prior contents of the
@@ -2468,10 +2550,11 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
unsigned short batch_id = 0, id_cur;
unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
unsigned int batch_off = ci_off;
+ bool is_vswap = swap_is_vswap(si);
VM_WARN_ON(ci->count < nr_pages);
- if (swap_is_vswap(si))
+ if (is_vswap)
vswap_release_backing(ci, ci_start, nr_pages);
ci->count -= nr_pages;
@@ -2491,18 +2574,28 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
/*
* Uncharge swap slots by memcg in batches. Consecutive
* slots with the same cgroup id are uncharged together.
+ * For vswap, only drop the ID ref — physical swap was
+ * already uncharged in vswap_release_backing above.
*/
id_cur = __swap_cgroup_clear(ci, ci_off, 1);
if (batch_id != id_cur) {
- if (batch_id)
- mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ if (batch_id) {
+ if (is_vswap)
+ mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+ else
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ }
batch_id = id_cur;
batch_off = ci_off;
}
} while (++ci_off < ci_end);
- if (batch_id)
- mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ if (batch_id) {
+ if (is_vswap)
+ mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+ else
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ }
__swap_cluster_finish_free(si, ci, ci_start, nr_pages);
}
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-28 21:30 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-28 21:29 [RFC PATCH 0/5] mm, swap: Virtual Swap Space (Swap Table Edition) Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 1/5] mm, swap: add virtual swap device infrastructure Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 2/5] mm, swap: support zswap and zeroswap as vswap backends Nhat Pham
2026-05-28 21:29 ` [RFC PATCH 3/5] mm, swap: support physical swap as a vswap backend Nhat Pham
2026-05-28 21:29 ` Nhat Pham [this message]
2026-05-28 21:29 ` [RFC PATCH 5/5] mm, swap: add debugfs counters for vswap Nhat Pham
2026-06-01 7:34 ` [RFC PATCH 0/5] mm, swap: Virtual Swap Space (Swap Table Edition) Kairui Song
2026-06-01 15:56 ` Nhat Pham
2026-06-01 16:22 ` Nhat Pham
2026-06-01 17:49 ` Kairui Song
2026-06-02 15:54 ` Nhat Pham
2026-06-02 16:43 ` Kairui Song
2026-06-01 17:44 ` Kairui Song
2026-06-01 18:06 ` Nhat Pham
2026-06-02 3:24 ` Kairui Song
2026-06-02 15:28 ` Nhat Pham
2026-06-03 1:29 ` Yosry Ahmed
2026-06-03 17:12 ` Nhat Pham
2026-06-03 17:22 ` Nhat Pham
2026-06-03 19:00 ` Yosry Ahmed
2026-06-03 18:58 ` Yosry Ahmed
2026-06-03 19:26 ` Nhat Pham
2026-06-03 19:35 ` Yosry Ahmed
2026-06-03 20:09 ` Nhat Pham
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260528212955.1912856-5-nphamcs@gmail.com \
--to=nphamcs@gmail.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=apopple@nvidia.com \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=byungchul@sk.com \
--cc=cgroups@vger.kernel.org \
--cc=chengming.zhou@linux.dev \
--cc=chrisl@kernel.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gourry@gourry.net \
--cc=hannes@cmpxchg.org \
--cc=haowenchao22@gmail.com \
--cc=hughd@google.com \
--cc=jannh@google.com \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=lenb@kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-pm@vger.kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=matthew.brost@intel.com \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=npache@redhat.com \
--cc=pavel@kernel.org \
--cc=peterx@redhat.com \
--cc=peterz@infradead.org \
--cc=pfalcato@suse.de \
--cc=rafael@kernel.org \
--cc=rakie.kim@sk.com \
--cc=riel@surriel.com \
--cc=roman.gushchin@linux.dev \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=surenb@google.com \
--cc=tglx@kernel.org \
--cc=vbabka@suse.cz \
--cc=weixugc@google.com \
--cc=ying.huang@linux.alibaba.com \
--cc=yosry.ahmed@linux.dev \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.