From: chrisl@kernel.org
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Kairui Song <kasong@tencent.com>, Hugh Dickins <hughd@google.com>,
Ryan Roberts <ryan.roberts@arm.com>,
"Huang, Ying" <ying.huang@intel.com>,
Kalesh Singh <kaleshsingh@google.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Chris Li <chrisl@kernel.org>, Barry Song <baohua@kernel.org>
Subject: [PATCH v5 6/9] mm: swap: allow cache reclaim to skip slot cache
Date: Tue, 30 Jul 2024 23:49:18 -0700
Message-ID: <20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org>
In-Reply-To: <20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org>
From: Kairui Song <kasong@tencent.com>
Currently we free reclaimed slots through the slot cache even when
the slot is required to be empty immediately. As a result the
reclaim caller will see the slot still occupied even after a
successful reclaim, and has to keep reclaiming until the slot cache
gets flushed. This causes ineffective or over reclaim when SWAP is
under stress.

So introduce a new flag that allows the slot to be emptied
immediately, bypassing the slot cache.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
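A minimal user-space model (not kernel code; all names here are
hypothetical) of the behavior this patch changes: a slot freed through a
batching cache still reads as occupied until the cache is flushed, while
a direct free, which is what TTRS_DIRECT requests, empties the slot
immediately.

#include <stdbool.h>
#include <stdio.h>

#define NSLOTS    8
#define CACHE_MAX 4

static unsigned char map[NSLOTS]; /* 0 = free, 1 = occupied */
static int cache[CACHE_MAX];      /* freed slots awaiting a flush */
static int cached;

static void flush_cache(void)
{
	while (cached)
		map[cache[--cached]] = 0;
}

static void free_slot(int slot, bool direct)
{
	if (direct) {
		map[slot] = 0;  /* slot is empty immediately */
		return;
	}
	cache[cached++] = slot; /* map[slot] still reads as occupied */
	if (cached == CACHE_MAX)
		flush_cache();
}

int main(void)
{
	map[3] = map[5] = 1;
	free_slot(3, false); /* deferred, like freeing via the slot cache */
	free_slot(5, true);  /* direct, like TTRS_DIRECT */
	printf("deferred: map[3] = %d (still occupied)\n", map[3]);
	printf("direct:   map[5] = %d (empty)\n", map[5]);
	return 0;
}

In the deferred case a reclaim caller polling map[3] would retry until
some flush happens; TTRS_DIRECT removes exactly that retry loop from
the scan_swap_map_slots() path.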
mm/swapfile.c | 152 +++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 109 insertions(+), 43 deletions(-)
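One more illustrative note: __try_to_reclaim_swap() now resolves the
given offset to the head of the backing folio and reports in whole-folio
units: 0 when no folio is in the swap cache at that offset, -nr_pages
when a folio exists but was not reclaimed, and +nr_pages when it was
reclaimed and freed. A caller-side sketch of that convention (variable
names here are illustrative, mirroring the free_swap_and_cache_nr()
hunk below):

int ret = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL);
unsigned int advance;

if (ret == 0)      /* no swap cache folio here, step one entry */
	advance = 1;
else if (ret < 0)  /* folio present but not reclaimed: skip it whole */
	advance = -ret;
else               /* reclaimed: that many entries were freed */
	advance = ret;
offset += advance;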
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9b63b2262cc2..4c0fc0409d3c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,8 +53,15 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
+static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
+ unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries);
+static bool folio_swapcache_freeable(struct folio *folio);
+static struct swap_cluster_info *lock_cluster_or_swap_info(
+ struct swap_info_struct *si, unsigned long offset);
+static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+ struct swap_cluster_info *ci);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -129,8 +136,25 @@ static inline unsigned char swap_count(unsigned char ent)
* corresponding page
*/
#define TTRS_UNMAPPED 0x2
-/* Reclaim the swap entry if swap is getting full*/
+/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
+/* Reclaim directly, bypass the slot cache and don't touch device lock */
+#define TTRS_DIRECT 0x8
+
+static bool swap_is_has_cache(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+
+ do {
+ VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
+ if (*map != SWAP_HAS_CACHE)
+ return false;
+ } while (++map < map_end);
+
+ return true;
+}
/*
 * returns number of pages in the folio that backs the swap entry. If positive,
 * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
 * folio was associated with the swap entry.
 */
@@ -141,12 +165,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
+ struct address_space *address_space = swap_address_space(entry);
+ struct swap_cluster_info *ci;
struct folio *folio;
- int ret = 0;
+ int ret, nr_pages;
+ bool need_reclaim;
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
+ folio = filemap_get_folio(address_space, swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
+
+ /* offset could point to the middle of a large folio */
+ entry = folio->swap;
+ offset = swp_offset(entry);
+ nr_pages = folio_nr_pages(folio);
+ ret = -nr_pages;
+
/*
* When this function is called from scan_swap_map_slots() and it's
* called by vmscan.c at reclaiming folios. So we hold a folio lock
@@ -154,14 +188,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* case and you should use folio_free_swap() with explicit folio_lock()
* in usual operations.
*/
- if (folio_trylock(folio)) {
- if ((flags & TTRS_ANYWAY) ||
- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
- ret = folio_free_swap(folio);
- folio_unlock(folio);
+ if (!folio_trylock(folio))
+ goto out;
+
+ need_reclaim = ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
+ if (!need_reclaim || !folio_swapcache_freeable(folio))
+ goto out_unlock;
+
+ /*
+ * It's safe to delete the folio from swap cache only if the folio's
+ * swap_map is HAS_CACHE only, which means the slots have no page table
+ * reference or pending writeback, and can't be allocated to others.
+ */
+ ci = lock_cluster_or_swap_info(si, offset);
+ need_reclaim = swap_is_has_cache(si, offset, nr_pages);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!need_reclaim)
+ goto out_unlock;
+
+ if (!(flags & TTRS_DIRECT)) {
+ /* Free through slot cache */
+ delete_from_swap_cache(folio);
+ folio_set_dirty(folio);
+ ret = nr_pages;
+ goto out_unlock;
}
- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
+
+ xa_lock_irq(&address_space->i_pages);
+ __delete_from_swap_cache(folio, entry, NULL);
+ xa_unlock_irq(&address_space->i_pages);
+ folio_ref_sub(folio, nr_pages);
+ folio_set_dirty(folio);
+
+ spin_lock(&si->lock);
+ /* Only single-page folios can be backed by zswap */
+ if (nr_pages == 1)
+ zswap_invalidate(entry);
+ swap_entry_range_free(si, entry, nr_pages);
+ spin_unlock(&si->lock);
+ ret = nr_pages;
+out_unlock:
+ folio_unlock(folio);
+out:
folio_put(folio);
return ret;
}
@@ -903,7 +973,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed > 0)
@@ -1340,9 +1410,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
unsigned long offset = swp_offset(entry);
struct swap_cluster_info *ci;
struct swap_info_struct *si;
- unsigned char *map;
- unsigned int i, free_entries = 0;
- unsigned char val;
int size = 1 << swap_entry_order(folio_order(folio));
si = _swap_info_get(entry);
@@ -1350,23 +1417,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
return;
ci = lock_cluster_or_swap_info(si, offset);
- if (size > 1) {
- map = si->swap_map + offset;
- for (i = 0; i < size; i++) {
- val = map[i];
- VM_BUG_ON(!(val & SWAP_HAS_CACHE));
- if (val == SWAP_HAS_CACHE)
- free_entries++;
- }
- if (free_entries == size) {
- unlock_cluster_or_swap_info(si, ci);
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, size);
- spin_unlock(&si->lock);
- return;
- }
+ if (size > 1 && swap_is_has_cache(si, offset, size)) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ swap_entry_range_free(si, entry, size);
+ spin_unlock(&si->lock);
+ return;
}
- for (i = 0; i < size; i++, entry.val++) {
+ for (int i = 0; i < size; i++, entry.val++) {
if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
unlock_cluster_or_swap_info(si, ci);
free_swap_slot(entry);
@@ -1526,16 +1584,7 @@ static bool folio_swapped(struct folio *folio)
return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
-/**
- * folio_free_swap() - Free the swap space used for this folio.
- * @folio: The folio to remove.
- *
- * If swap is getting full, or if there are no more mappings of this folio,
- * then call folio_free_swap to free its swap space.
- *
- * Return: true if we were able to release the swap space.
- */
-bool folio_free_swap(struct folio *folio)
+static bool folio_swapcache_freeable(struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -1543,8 +1592,6 @@ bool folio_free_swap(struct folio *folio)
return false;
if (folio_test_writeback(folio))
return false;
- if (folio_swapped(folio))
- return false;
/*
* Once hibernation has begun to create its image of memory,
@@ -1564,6 +1611,25 @@ bool folio_free_swap(struct folio *folio)
if (pm_suspended_storage())
return false;
+ return true;
+}
+
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio to remove.
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
+ */
+bool folio_free_swap(struct folio *folio)
+{
+ if (!folio_swapcache_freeable(folio))
+ return false;
+ if (folio_swapped(folio))
+ return false;
+
delete_from_swap_cache(folio);
folio_set_dirty(folio);
return true;
@@ -1640,7 +1706,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
* to the next boundary.
*/
nr = __try_to_reclaim_swap(si, offset,
- TTRS_UNMAPPED | TTRS_FULL);
+ TTRS_UNMAPPED | TTRS_FULL);
if (nr == 0)
nr = 1;
else if (nr < 0)
--
2.46.0.rc1.232.g9752f9e123-goog
Thread overview: 32+ messages
2024-07-31 6:49 [PATCH v5 0/9] mm: swap: mTHP swap allocator base on swap cluster order Chris Li
2024-07-31 6:49 ` [PATCH v5 1/9] mm: swap: swap cluster switch to double link list Chris Li
2024-07-31 6:49 ` [PATCH v5 2/9] mm: swap: mTHP allocate swap entries from nonfull list Chris Li
[not found] ` <87bk23250r.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 8:01 ` Chris Li
2024-08-19 8:08 ` Huang, Ying
2024-08-26 21:26 ` Chris Li
2024-09-09 7:19 ` Huang, Ying
2024-07-31 6:49 ` [PATCH v5 3/9] mm: swap: separate SSD allocation from scan_swap_map_slots() Chris Li
2024-07-31 6:49 ` [PATCH v5 4/9] mm: swap: clean up initialization helper chrisl
2024-07-31 6:49 ` [PATCH v5 5/9] mm: swap: skip slot cache on freeing for mTHP chrisl
2024-08-03 9:11 ` Barry Song
2024-08-03 10:57 ` Barry Song
2024-07-31 6:49 ` chrisl [this message]
2024-08-03 10:38 ` [PATCH v5 6/9] mm: swap: allow cache reclaim to skip slot cache Barry Song
2024-08-03 12:18 ` Kairui Song
2024-08-04 18:06 ` Chris Li
2024-08-05 1:53 ` Barry Song
2024-07-31 6:49 ` [PATCH v5 7/9] mm: swap: add a fragment cluster list chrisl
2024-07-31 6:49 ` [PATCH v5 8/9] mm: swap: relaim the cached parts that got scanned chrisl
2024-07-31 6:49 ` [PATCH v5 9/9] mm: swap: add a adaptive full cluster cache reclaim chrisl
2024-08-01 9:14 ` [PATCH v5 0/9] mm: swap: mTHP swap allocator base on swap cluster order David Hildenbrand
2024-08-01 9:59 ` Kairui Song
2024-08-01 10:06 ` Kairui Song
[not found] ` <87le17z9zr.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 7:36 ` Chris Li
2024-08-17 17:47 ` Kairui Song
[not found] ` <87h6bw3gxl.fsf@yhuang6-desk2.ccr.corp.intel.com>
[not found] ` <CACePvbXH8b9SOePQ-Ld_UBbcAdJ3gdYtEkReMto5Hbq9WAL7JQ@mail.gmail.com>
[not found] ` <87sevfza3w.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 7:47 ` Chris Li
2024-08-18 16:59 ` Kairui Song
2024-08-19 8:27 ` Huang, Ying
2024-08-19 8:47 ` Kairui Song
2024-08-19 21:27 ` Chris Li
2024-08-19 8:39 ` Huang, Ying
2024-09-02 1:20 ` Andrew Morton