From: Nhat Pham <nphamcs@gmail.com>
To: kasong@tencent.com
Cc: Liam.Howlett@oracle.com, akpm@linux-foundation.org,
apopple@nvidia.com, axelrasmussen@google.com, baohua@kernel.org,
baolin.wang@linux.alibaba.com, bhe@redhat.com, byungchul@sk.com,
cgroups@vger.kernel.org, chengming.zhou@linux.dev,
chrisl@kernel.org, corbet@lwn.net, david@kernel.org,
dev.jain@arm.com, gourry@gourry.net, hannes@cmpxchg.org,
hughd@google.com, jannh@google.com, joshua.hahnjy@gmail.com,
lance.yang@linux.dev, lenb@kernel.org, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-pm@vger.kernel.org, lorenzo.stoakes@oracle.com,
matthew.brost@intel.com, mhocko@suse.com, muchun.song@linux.dev,
npache@redhat.com, nphamcs@gmail.com, pavel@kernel.org,
peterx@redhat.com, peterz@infradead.org, pfalcato@suse.de,
rafael@kernel.org, rakie.kim@sk.com, roman.gushchin@linux.dev,
rppt@kernel.org, ryan.roberts@arm.com, shakeel.butt@linux.dev,
shikemeng@huaweicloud.com, surenb@google.com, tglx@kernel.org,
vbabka@suse.cz, weixugc@google.com, ying.huang@linux.alibaba.com,
yosry.ahmed@linux.dev, yuanchu@google.com,
zhengqi.arch@bytedance.com, ziy@nvidia.com, kernel-team@meta.com,
riel@surriel.com
Subject: [PATCH v5 20/21] swapfile: replace the swap map with bitmaps
Date: Fri, 20 Mar 2026 12:27:34 -0700 [thread overview]
Message-ID: <20260320192735.748051-21-nphamcs@gmail.com> (raw)
In-Reply-To: <20260320192735.748051-1-nphamcs@gmail.com>
Now that we have moved the swap count state to the virtual swap layer, each
swap map entry only has 3 possible states: free, allocated, and bad.
Replace the swap map with 2 bitmaps (one for allocated state and one for
bad state), saving 6 bits per swap entry.
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
include/linux/swap.h | 3 +-
mm/swapfile.c | 81 +++++++++++++++++++++++---------------------
2 files changed, 44 insertions(+), 40 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 21e528d8d3480..3c789149996c5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,7 +259,8 @@ struct swap_info_struct {
struct plist_node list; /* entry in swap_active_head */
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
- unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ unsigned long *swap_map; /* bitmap for allocated state */
+ unsigned long *bad_map; /* bitmap for bad state */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
struct list_head free_clusters; /* free clusters list */
struct list_head full_clusters; /* full clusters list */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b553652125d11..3e2bfcf1aa789 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -760,25 +760,19 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long start, unsigned long end)
{
- unsigned char *map = si->swap_map;
unsigned long offset = start;
int nr_reclaim;
spin_unlock(&ci->lock);
do {
- switch (READ_ONCE(map[offset])) {
- case 0:
+ if (!test_bit(offset, si->swap_map)) {
offset++;
- break;
- case SWAP_MAP_ALLOCATED:
+ } else {
nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
if (nr_reclaim > 0)
offset += nr_reclaim;
else
goto out;
- break;
- default:
- goto out;
}
} while (offset < end);
out:
@@ -787,11 +781,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
* Recheck the range no matter reclaim succeeded or not, the slot
 * could have been freed while we are not holding the lock.
*/
- for (offset = start; offset < end; offset++)
- if (READ_ONCE(map[offset]))
- return false;
-
- return true;
+ return find_next_bit(si->swap_map, end, start) >= end;
}
static bool cluster_scan_range(struct swap_info_struct *si,
@@ -800,15 +790,16 @@ static bool cluster_scan_range(struct swap_info_struct *si,
bool *need_reclaim)
{
unsigned long offset, end = start + nr_pages;
- unsigned char *map = si->swap_map;
- unsigned char count;
if (cluster_is_empty(ci))
return true;
for (offset = start; offset < end; offset++) {
- count = READ_ONCE(map[offset]);
- if (!count)
+ /* Bad slots cannot be used for allocation */
+ if (test_bit(offset, si->bad_map))
+ return false;
+
+ if (!test_bit(offset, si->swap_map))
continue;
if (swap_cache_only(si, offset)) {
@@ -841,7 +832,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
if (cluster_is_empty(ci))
ci->order = order;
- memset(si->swap_map + start, usage, nr_pages);
+ bitmap_set(si->swap_map, start, nr_pages);
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
@@ -1407,7 +1398,7 @@ static struct swap_info_struct *_swap_info_get(swp_slot_t slot)
offset = swp_slot_offset(slot);
if (offset >= si->max)
goto bad_offset;
- if (data_race(!si->swap_map[swp_slot_offset(slot)]))
+ if (data_race(!test_bit(offset, si->swap_map)))
goto bad_free;
return si;
@@ -1521,8 +1512,7 @@ static void swap_slots_free(struct swap_info_struct *si,
swp_slot_t slot, unsigned int nr_pages)
{
unsigned long offset = swp_slot_offset(slot);
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
+ unsigned long end = offset + nr_pages;
/* It should never free entries across different clusters */
VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
@@ -1530,10 +1520,8 @@ static void swap_slots_free(struct swap_info_struct *si,
VM_BUG_ON(ci->count < nr_pages);
ci->count -= nr_pages;
- do {
- VM_BUG_ON(!swap_is_last_ref(*map));
- *map = 0;
- } while (++map < map_end);
+ VM_BUG_ON(find_next_zero_bit(si->swap_map, end, offset) < end);
+ bitmap_clear(si->swap_map, offset, nr_pages);
swap_range_free(si, offset, nr_pages);
@@ -1744,9 +1732,7 @@ unsigned int count_swap_pages(int type, int free)
static bool swap_slot_allocated(struct swap_info_struct *si,
unsigned long offset)
{
- unsigned char count = READ_ONCE(si->swap_map[offset]);
-
- return count && swap_count(count) != SWAP_MAP_BAD;
+ return test_bit(offset, si->swap_map);
}
/*
@@ -2067,7 +2053,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
}
static void setup_swap_info(struct swap_info_struct *si, int prio,
- unsigned char *swap_map,
+ unsigned long *swap_map,
struct swap_cluster_info *cluster_info)
{
si->prio = prio;
@@ -2095,7 +2081,7 @@ static void _enable_swap_info(struct swap_info_struct *si)
}
static void enable_swap_info(struct swap_info_struct *si, int prio,
- unsigned char *swap_map,
+ unsigned long *swap_map,
struct swap_cluster_info *cluster_info)
{
spin_lock(&swap_lock);
@@ -2188,7 +2174,8 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
- unsigned char *swap_map;
+ unsigned long *swap_map;
+ unsigned long *bad_map;
struct swap_cluster_info *cluster_info;
struct file *swap_file, *victim;
struct address_space *mapping;
@@ -2283,6 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_file = NULL;
swap_map = p->swap_map;
p->swap_map = NULL;
+ bad_map = p->bad_map;
+ p->bad_map = NULL;
maxpages = p->max;
cluster_info = p->cluster_info;
p->max = 0;
@@ -2293,7 +2282,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
kfree(p->global_cluster);
p->global_cluster = NULL;
- vfree(swap_map);
+ kvfree(swap_map);
+ kvfree(bad_map);
free_cluster_info(cluster_info, maxpages);
inode = mapping->host;
@@ -2641,18 +2631,20 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
static int setup_swap_map(struct swap_info_struct *si,
union swap_header *swap_header,
- unsigned char *swap_map,
+ unsigned long *swap_map,
+ unsigned long *bad_map,
unsigned long maxpages)
{
unsigned long i;
- swap_map[0] = SWAP_MAP_BAD; /* omit header page */
+ set_bit(0, bad_map); /* omit header page */
+
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
- swap_map[page_nr] = SWAP_MAP_BAD;
+ set_bit(page_nr, bad_map);
si->pages--;
}
}
@@ -2756,7 +2748,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
int nr_extents;
sector_t span;
unsigned long maxpages;
- unsigned char *swap_map = NULL;
+ unsigned long *swap_map = NULL, *bad_map = NULL;
struct swap_cluster_info *cluster_info = NULL;
struct folio *folio = NULL;
struct inode *inode = NULL;
@@ -2852,16 +2844,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
maxpages = si->max;
/* OK, set up the swap map and apply the bad block list */
- swap_map = vzalloc(maxpages);
+ swap_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL);
if (!swap_map) {
error = -ENOMEM;
goto bad_swap_unlock_inode;
}
- error = setup_swap_map(si, swap_header, swap_map, maxpages);
+ bad_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL);
+ if (!bad_map) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
+
+ error = setup_swap_map(si, swap_header, swap_map, bad_map, maxpages);
if (error)
goto bad_swap_unlock_inode;
+ si->bad_map = bad_map;
+
if (si->bdev && bdev_stable_writes(si->bdev))
si->flags |= SWP_STABLE_WRITES;
@@ -2955,7 +2955,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
si->swap_file = NULL;
si->flags = 0;
spin_unlock(&swap_lock);
- vfree(swap_map);
+ if (swap_map)
+ kvfree(swap_map);
+ if (bad_map)
+ kvfree(bad_map);
if (cluster_info)
free_cluster_info(cluster_info, maxpages);
if (inced_nr_rotate_swap)
--
2.52.0
next prev parent reply other threads:[~2026-03-20 19:28 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-20 19:27 [PATCH v5 00/21] Virtual Swap Space Nhat Pham
2026-03-20 19:27 ` [PATCH v5 01/21] mm/swap: decouple swap cache from physical swap infrastructure Nhat Pham
2026-03-20 19:27 ` [PATCH v5 02/21] swap: rearrange the swap header file Nhat Pham
2026-03-20 19:27 ` [PATCH v5 03/21] mm: swap: add an abstract API for locking out swapoff Nhat Pham
2026-03-20 19:27 ` [PATCH v5 04/21] zswap: add new helpers for zswap entry operations Nhat Pham
2026-03-20 19:27 ` [PATCH v5 05/21] mm/swap: add a new function to check if a swap entry is in swap cached Nhat Pham
2026-03-20 19:27 ` [PATCH v5 06/21] mm: swap: add a separate type for physical swap slots Nhat Pham
2026-03-20 19:27 ` [PATCH v5 07/21] mm: create scaffolds for the new virtual swap implementation Nhat Pham
2026-03-20 19:27 ` [PATCH v5 08/21] zswap: prepare zswap for swap virtualization Nhat Pham
2026-03-20 19:27 ` [PATCH v5 09/21] mm: swap: allocate a virtual swap slot for each swapped out page Nhat Pham
2026-03-20 19:27 ` [PATCH v5 10/21] swap: move swap cache to virtual swap descriptor Nhat Pham
2026-03-20 19:27 ` [PATCH v5 11/21] zswap: move zswap entry management to the " Nhat Pham
2026-03-20 19:27 ` [PATCH v5 12/21] swap: implement the swap_cgroup API using virtual swap Nhat Pham
2026-03-20 19:27 ` [PATCH v5 13/21] swap: manage swap entry lifecycle at the virtual swap layer Nhat Pham
2026-03-20 19:27 ` [PATCH v5 14/21] mm: swap: decouple virtual swap slot from backing store Nhat Pham
2026-03-20 19:27 ` [PATCH v5 15/21] zswap: do not start zswap shrinker if there is no physical swap slots Nhat Pham
2026-03-20 19:27 ` [PATCH v5 16/21] swap: do not unnecesarily pin readahead swap entries Nhat Pham
2026-03-20 19:27 ` [PATCH v5 17/21] swapfile: remove zeromap bitmap Nhat Pham
2026-03-20 19:27 ` [PATCH v5 18/21] memcg: swap: only charge physical swap slots Nhat Pham
2026-03-20 19:27 ` [PATCH v5 19/21] swap: simplify swapoff using virtual swap Nhat Pham
2026-03-20 19:27 ` Nhat Pham [this message]
2026-03-20 19:27 ` [PATCH v5 21/21] vswap: batch contiguous vswap free calls Nhat Pham
2026-03-21 18:22 ` [PATCH v5 00/21] Virtual Swap Space Andrew Morton
2026-03-22 2:18 ` Roman Gushchin
[not found] ` <CAMgjq7AiUr_Ntj51qoqvV+=XbEATjr7S4MH+rgD32T5pHfF7mg@mail.gmail.com>
2026-03-23 15:32 ` Nhat Pham
2026-03-23 16:40 ` Kairui Song
2026-03-23 20:05 ` Nhat Pham
2026-03-25 18:53 ` YoungJun Park
2026-03-24 13:19 ` Askar Safin
2026-03-24 17:23 ` Nhat Pham
2026-03-25 2:35 ` Askar Safin
2026-03-25 18:36 ` YoungJun Park
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260320192735.748051-21-nphamcs@gmail.com \
--to=nphamcs@gmail.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=apopple@nvidia.com \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=byungchul@sk.com \
--cc=cgroups@vger.kernel.org \
--cc=chengming.zhou@linux.dev \
--cc=chrisl@kernel.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gourry@gourry.net \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=jannh@google.com \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=lenb@kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-pm@vger.kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=matthew.brost@intel.com \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=npache@redhat.com \
--cc=pavel@kernel.org \
--cc=peterx@redhat.com \
--cc=peterz@infradead.org \
--cc=pfalcato@suse.de \
--cc=rafael@kernel.org \
--cc=rakie.kim@sk.com \
--cc=riel@surriel.com \
--cc=roman.gushchin@linux.dev \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=surenb@google.com \
--cc=tglx@kernel.org \
--cc=vbabka@suse.cz \
--cc=weixugc@google.com \
--cc=ying.huang@linux.alibaba.com \
--cc=yosry.ahmed@linux.dev \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox