From: Nhat Pham <nphamcs@gmail.com>
To: linux-mm@kvack.org
Cc: akpm@linux-foundation.org, hannes@cmpxchg.org, hughd@google.com,
	yosry.ahmed@linux.dev, mhocko@kernel.org,
	roman.gushchin@linux.dev, shakeel.butt@linux.dev,
	muchun.song@linux.dev, len.brown@intel.com,
	chengming.zhou@linux.dev, kasong@tencent.com, chrisl@kernel.org,
	huang.ying.caritas@gmail.com, ryan.roberts@arm.com,
	viro@zeniv.linux.org.uk, baohua@kernel.org, osalvador@suse.de,
	lorenzo.stoakes@oracle.com, christophe.leroy@csgroup.eu,
	pavel@kernel.org, kernel-team@meta.com,
	linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
	linux-pm@vger.kernel.org
Subject: [RFC PATCH 11/14] memcg: swap: only charge physical swap slots
Date: Mon,  7 Apr 2025 16:42:12 -0700
Message-ID: <20250407234223.1059191-12-nphamcs@gmail.com>
In-Reply-To: <20250407234223.1059191-1-nphamcs@gmail.com>

Now that zswap and the zero-filled swap page optimization no longer
take up any physical swap space, we should not charge these cases
towards the memcg's swap usage and limits. Instead, we only record the
memcg id when a virtual swap slot is allocated, and defer physical swap
charging (i.e., towards memory.swap.current) until the virtual swap
slot is backed by an actual physical swap slot (on zswap store failure
fallback or zswap writeback).
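
Roughly, the resulting flow is the following (a simplified sketch of
the call sequence after this patch; error handling and the physical
slot allocation itself are omitted):

	/* swap out: allocating a virtual slot only records the owning memcg */
	entry = vswap_alloc(nr);
	mem_cgroup_record_swap(folio, entry);	/* memcg id only, no charge */

	/*
	 * Later, only if a physical slot is actually needed (zswap
	 * writeback, or fallback after a failed zswap store):
	 */
	if (mem_cgroup_try_charge_swap(folio, entry))	/* memory.swap.current */
		goto free_phys_swap;	/* release the physical slot */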

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h |  17 ++++++++
 mm/memcontrol.c      | 102 ++++++++++++++++++++++++++++++++++---------
 mm/vswap.c           |  43 ++++++++----------
 3 files changed, 118 insertions(+), 44 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 073835335667..98cdfe0c1da7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -679,6 +679,23 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
+
+void __mem_cgroup_record_swap(struct folio *folio, swp_entry_t entry);
+static inline void mem_cgroup_record_swap(struct folio *folio,
+		swp_entry_t entry)
+{
+	if (!mem_cgroup_disabled())
+		__mem_cgroup_record_swap(folio, entry);
+}
+
+void __mem_cgroup_unrecord_swap(swp_entry_t entry, unsigned int nr_pages);
+static inline void mem_cgroup_unrecord_swap(swp_entry_t entry,
+		unsigned int nr_pages)
+{
+	if (!mem_cgroup_disabled())
+		__mem_cgroup_unrecord_swap(entry, nr_pages);
+}
+
 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
 static inline int mem_cgroup_try_charge_swap(struct folio *folio,
 		swp_entry_t entry)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 126b2d0e6aaa..c6bee12f2016 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5020,6 +5020,46 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
 	css_put(&memcg->css);
 }
 
+/**
+ * __mem_cgroup_record_swap - record the folio's cgroup for the swap entries.
+ * @folio: folio being swapped out.
+ * @entry: the first swap entry in the range.
+ *
+ * In the virtual swap implementation, we only record the folio's cgroup
+ * when the virtual swap slots are allocated. We only charge physical
+ * swap slots towards the cgroup's swap usage, i.e. when physical swap
+ * slots are allocated for zswap writeback or as a fallback from zswap
+ * store failure.
+ */
+void __mem_cgroup_record_swap(struct folio *folio, swp_entry_t entry)
+{
+	unsigned int nr_pages = folio_nr_pages(folio);
+	struct mem_cgroup *memcg;
+
+	memcg = folio_memcg(folio);
+
+	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+	if (!memcg)
+		return;
+
+	memcg = mem_cgroup_id_get_online(memcg);
+	if (nr_pages > 1)
+		mem_cgroup_id_get_many(memcg, nr_pages - 1);
+	swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+}
+
+void __mem_cgroup_unrecord_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+	unsigned short id = swap_cgroup_clear(entry, nr_pages);
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_id(id);
+	if (memcg)
+		mem_cgroup_id_put_many(memcg, nr_pages);
+	rcu_read_unlock();
+}
+
 /**
  * __mem_cgroup_try_charge_swap - try charging swap space for a folio
  * @folio: folio being added to swap
@@ -5038,34 +5078,47 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
 	if (do_memsw_account())
 		return 0;
 
-	memcg = folio_memcg(folio);
+	if (IS_ENABLED(CONFIG_VIRTUAL_SWAP)) {
+		/*
+		 * In the virtual swap implementation, we already record the cgroup
+		 * on virtual swap allocation. Note that the virtual swap slot holds
+		 * a reference to memcg, so this lookup should be safe.
+		 */
+		rcu_read_lock();
+		memcg = mem_cgroup_from_id(lookup_swap_cgroup_id(entry));
+		rcu_read_unlock();
+	} else {
+		memcg = folio_memcg(folio);
 
-	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
-	if (!memcg)
-		return 0;
+		VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+		if (!memcg)
+			return 0;
 
-	if (!entry.val) {
-		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
-		return 0;
-	}
+		if (!entry.val) {
+			memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+			return 0;
+		}
 
-	memcg = mem_cgroup_id_get_online(memcg);
+		memcg = mem_cgroup_id_get_online(memcg);
+	}
 
 	if (!mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
-		mem_cgroup_id_put(memcg);
+		if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+			mem_cgroup_id_put(memcg);
 		return -ENOMEM;
 	}
 
-	/* Get references for the tail pages, too */
-	if (nr_pages > 1)
-		mem_cgroup_id_get_many(memcg, nr_pages - 1);
+	if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP)) {
+		/* Get references for the tail pages, too */
+		if (nr_pages > 1)
+			mem_cgroup_id_get_many(memcg, nr_pages - 1);
+		swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+	}
 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
-	swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
-
 	return 0;
 }
 
@@ -5079,7 +5132,11 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 	struct mem_cgroup *memcg;
 	unsigned short id;
 
-	id = swap_cgroup_clear(entry, nr_pages);
+	if (IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+		id = lookup_swap_cgroup_id(entry);
+	else
+		id = swap_cgroup_clear(entry, nr_pages);
+
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
@@ -5090,7 +5147,8 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 				page_counter_uncharge(&memcg->swap, nr_pages);
 		}
 		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
-		mem_cgroup_id_put_many(memcg, nr_pages);
+		if (!IS_ENABLED(CONFIG_VIRTUAL_SWAP))
+			mem_cgroup_id_put_many(memcg, nr_pages);
 	}
 	rcu_read_unlock();
 }
@@ -5099,7 +5157,7 @@ static bool mem_cgroup_may_zswap(struct mem_cgroup *original_memcg);
 
 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 {
-	long nr_swap_pages, nr_zswap_pages = 0;
+	long nr_swap_pages;
 
 	/*
 	 * If swap is virtualized and zswap is enabled, we can still use zswap even
@@ -5108,10 +5166,14 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 	if (IS_ENABLED(CONFIG_VIRTUAL_SWAP) && zswap_is_enabled() &&
 			(mem_cgroup_disabled() || do_memsw_account() ||
 				mem_cgroup_may_zswap(memcg))) {
-		nr_zswap_pages = PAGE_COUNTER_MAX;
+		/*
+		 * No need to check swap cgroup limits, since zswap is not charged
+		 * towards swap consumption.
+		 */
+		return PAGE_COUNTER_MAX;
 	}
 
-	nr_swap_pages = max_t(long, nr_zswap_pages, get_nr_swap_pages());
+	nr_swap_pages = get_nr_swap_pages();
 	if (mem_cgroup_disabled() || do_memsw_account())
 		return nr_swap_pages;
 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
diff --git a/mm/vswap.c b/mm/vswap.c
index 3146c231ca69..fcc7807ba89b 100644
--- a/mm/vswap.c
+++ b/mm/vswap.c
@@ -349,6 +349,7 @@ static inline void release_backing(swp_entry_t entry, int nr)
 			swap_slot_free_nr(slot, nr);
 			swap_slot_put_swap_info(si);
 		}
+		mem_cgroup_uncharge_swap(entry, nr);
 	}
 }
 
@@ -367,7 +368,7 @@ static void vswap_free(swp_entry_t entry)
 
 	virt_clear_shadow_from_swap_cache(entry);
 	release_backing(entry, 1);
-	mem_cgroup_uncharge_swap(entry, 1);
+	mem_cgroup_unrecord_swap(entry, 1);
 	/* erase forward mapping and release the virtual slot for reallocation */
 	release_vswap_slot(entry.val);
 	kfree_rcu(desc, rcu);
@@ -392,27 +393,13 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 {
 	swp_entry_t entry;
 	struct swp_desc *desc;
-	int i, nr = folio_nr_pages(folio);
+	int nr = folio_nr_pages(folio);
 
 	entry = vswap_alloc(nr);
 	if (!entry.val)
 		return entry;
 
-	/*
-	 * XXX: for now, we charge towards the memory cgroup's swap limit on virtual
-	 * swap slots allocation. This will be changed soon - we will only charge on
-	 * physical swap slots allocation.
-	 */
-	if (mem_cgroup_try_charge_swap(folio, entry)) {
-		for (i = 0; i < nr; i++) {
-			vswap_free(entry);
-			entry.val++;
-		}
-		atomic_add(nr, &vswap_alloc_reject);
-		entry.val = 0;
-		return entry;
-	}
-
+	mem_cgroup_record_swap(folio, entry);
 	XA_STATE(xas, &vswap_map, entry.val);
 
 	rcu_read_lock();
@@ -454,6 +441,9 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
 	if (!slot.val)
 		return slot;
 
+	if (mem_cgroup_try_charge_swap(folio, entry))
+		goto free_phys_swap;
+
 	/* establish the virtual <-> physical swap slots linkages. */
 	for (i = 0; i < nr; i++) {
 		err = xa_insert(&vswap_rmap, slot.val + i,
@@ -462,13 +452,7 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
 		if (err) {
 			while (--i >= 0)
 				xa_erase(&vswap_rmap, slot.val + i);
-			/*
-			 * We have not updated the backing type of the virtual swap slot.
-			 * Simply free up the physical swap slots here!
-			 */
-			swap_slot_free_nr(slot, nr);
-			slot.val = 0;
-			return slot;
+			goto uncharge;
 		}
 	}
 
@@ -505,6 +489,17 @@ swp_slot_t vswap_alloc_swap_slot(struct folio *folio)
 	}
 	rcu_read_unlock();
 	return slot;
+
+uncharge:
+	mem_cgroup_uncharge_swap(entry, nr);
+free_phys_swap:
+	/*
+	 * We have not updated the backing type of the virtual swap slot.
+	 * Simply free up the physical swap slots here!
+	 */
+	swap_slot_free_nr(slot, nr);
+	slot.val = 0;
+	return slot;
 }
 
 /**
-- 
2.47.1



