From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
Matthew Wilcox <willy@infradead.org>,
Hugh Dickins <hughd@google.com>, Chris Li <chrisl@kernel.org>,
David Hildenbrand <david@redhat.com>,
Yosry Ahmed <yosryahmed@google.com>,
"Huang, Ying" <ying.huang@linux.alibaba.com>,
Nhat Pham <nphamcs@gmail.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Baolin Wang <baolin.wang@linux.alibaba.com>,
Baoquan He <bhe@redhat.com>, Barry Song <baohua@kernel.org>,
Kalesh Singh <kaleshsingh@google.com>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Ryan Roberts <ryan.roberts@arm.com>,
linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [PATCH 25/28] mm/workingset: leave highest 8 bits empty for anon shadow
Date: Thu, 15 May 2025 04:17:25 +0800
Message-ID: <20250514201729.48420-26-ryncsn@gmail.com>
In-Reply-To: <20250514201729.48420-1-ryncsn@gmail.com>
From: Kairui Song <kasong@tencent.com>
The swap table entry will need 8 bits reserved for the swap count, so the
anon shadow must keep its highest 8 bits zero.
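For illustration, a rough sketch of the resulting anon shadow layout
(field widths are config dependent and the 8-bit swap count field is an
assumption based on this series, not the kernel's actual definitions):

    /*
     *  63 .. 56      ...           ...       ...       1        0
     * +----------+-------------+----------+------+------------+-----+
     * | swap cnt | eviction ts | memcg id | node | workingset | XA  |
     * | (kept 0) |             |          |      |    flag    | bit |
     * +----------+-------------+----------+------+------------+-----+
     */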
This should be fine for the foreseeable future. Take a 52-bit physical
address space as an example: with 4K pages, at most 40 bits are needed to
address every page. Currently 36 bits are available for the eviction
timestamp (with NODES_SHIFT set to 10; it can be lowered to free up more
bits), so in the worst case the refault distance comparison is done at a
64K bucket granularity.
This commit may increase the anon bucket size to 16M, which should be fine
since the working set will be far larger than the bucket size on machines
that large.
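To make the arithmetic above concrete, here is a small userspace sketch
(the constants mirror the 52-bit / 4K example in this message and the
8-bit swap count reservation is an assumption; this is not the kernel's
actual config-dependent code):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed example values from the description above. */
            unsigned int max_order = 40;       /* 52-bit PA space, 4K pages */
            unsigned int timestamp_bits = 36;  /* current file timestamp bits */
            unsigned int swap_count_bits = 8;  /* reserved for the swap count */
            unsigned int timestamp_bits_anon = timestamp_bits - swap_count_bits;

            unsigned int file_order = max_order > timestamp_bits ?
                                      max_order - timestamp_bits : 0;
            unsigned int anon_order = max_order > timestamp_bits_anon ?
                                      max_order - timestamp_bits_anon : 0;

            /* Bucket size in KiB: 2^order pages of 4 KiB each. */
            printf("file: bucket_order=%u -> %lu KiB\n",
                   file_order, (1UL << file_order) * 4);
            printf("anon: bucket_order=%u -> %lu KiB\n",
                   anon_order, (1UL << anon_order) * 4);
            return 0;
    }

Run as-is this prints a 64 KiB file bucket and a 16 MiB (16384 KiB) anon
bucket, matching the figures above.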
For MGLRU, 28 bits can already track a huge number of generations, so
there should be no problem there either.
The reserved 8 bits can also be reduced to 6 or even fewer bits later.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/swap_table.h | 1 +
mm/workingset.c | 39 ++++++++++++++++++++++++++-------------
2 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/mm/swap_table.h b/mm/swap_table.h
index 9356004d211a..afb2953d408a 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -65,6 +65,7 @@ static inline swp_te_t shadow_swp_te(void *shadow)
BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * sizeof(swp_te_t));
BUILD_BUG_ON((unsigned long)xa_mk_value(0) != ENTRY_SHADOW_MARK);
VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
+ VM_WARN_ON((unsigned long)shadow & ENTRY_COUNT_MASK);
swp_te.counter |= ENTRY_SHADOW_MARK;
return swp_te;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 6e7f4cb1b9a7..86a549a17ae1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "swap_table.h"
#include "internal.h"
/*
@@ -184,7 +185,9 @@
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
WORKINGSET_SHIFT + NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT)
+#define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+#define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON)
/*
* Eviction timestamps need to be able to cover the full range of
@@ -194,12 +197,16 @@
* that case, we have to sacrifice granularity for distance, and group
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
-static unsigned int bucket_order __read_mostly;
+static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
- bool workingset)
+ bool workingset, bool file)
{
- eviction &= EVICTION_MASK;
+ if (file)
+ eviction &= EVICTION_MASK;
+ else
+ eviction &= EVICTION_MASK_ANON;
+
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << WORKINGSET_SHIFT) | workingset;
@@ -244,7 +251,8 @@ static void *lru_gen_eviction(struct folio *folio)
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
- BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
+ BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
@@ -254,7 +262,7 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset, type);
}
/*
@@ -381,6 +389,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ int file = folio_is_file_lru(folio);
unsigned long eviction;
struct lruvec *lruvec;
int memcgid;
@@ -397,10 +406,10 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
- eviction >>= bucket_order;
+ eviction >>= bucket_order[file];
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
return pack_shadow(memcgid, pgdat, eviction,
- folio_test_workingset(folio));
+ folio_test_workingset(folio), folio_is_file_lru(folio));
}
/**
@@ -438,7 +447,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
rcu_read_lock();
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
- eviction <<= bucket_order;
+ eviction <<= bucket_order[file];
/*
* Look up the memcg associated with the stored ID. It might
@@ -780,8 +789,8 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits, timestamp_bits_anon;
struct shrinker *workingset_shadow_shrinker;
- unsigned int timestamp_bits;
unsigned int max_order;
int ret = -ENOMEM;
@@ -794,11 +803,15 @@ static int __init workingset_init(void)
* double the initial memory by using totalram_pages as-is.
*/
timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
max_order = fls_long(totalram_pages() - 1);
- if (max_order > timestamp_bits)
- bucket_order = max_order - timestamp_bits;
- pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
- timestamp_bits, max_order, bucket_order);
+ if (max_order > (BITS_PER_LONG - EVICTION_SHIFT))
+ bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
+ if (max_order > timestamp_bits_anon)
+ bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
+ pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
+ timestamp_bits, timestamp_bits_anon, max_order,
+ bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE,
--
2.49.0