From: Youngjun Park <youngjun.park@lge.com>
To: akpm@linux-foundation.org, hannes@cmpxchg.org
Cc: mhocko@kernel.org, roman.gushchin@linux.dev,
shakeel.butt@linux.dev, muchun.song@linux.dev,
shikemeng@huaweicloud.com, kasong@tencent.com, nphamcs@gmail.com,
bhe@redhat.com, baohua@kernel.org, chrisl@kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, gunho.lee@lge.com,
iamjoonsoo.kim@lge.com, taejoon.song@lge.com,
Youngjun Park <youngjun.park@lge.com>
Subject: [PATCH 4/4] mm: swap: Per-cgroup per-CPU swap device cache with shared clusters
Date: Thu, 17 Jul 2025 05:20:06 +0900
Message-ID: <20250716202006.3640584-5-youngjun.park@lge.com>
In-Reply-To: <20250716202006.3640584-1-youngjun.park@lge.com>
Introduce a new swap allocation mechanism that supports per-cgroup
per-CPU swap device caches, combined with per-device per-CPU cluster
management.
The existing global swap allocator uses a per-CPU device cache and
cluster, shared by all cgroups. Under this model, per-cgroup swap
priorities cannot be effectively honored on the fast path, as allocations
do not distinguish between cgroups.
To address this, we introduce per-cgroup per-CPU swap device caches.
This allows fast-path swap allocations to respect each cgroup’s
individual priority settings.
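For illustration, the fast path then amounts to the following
(simplified sketch of swap_alloc_cgroup_priority() below; the real
code also takes RCU and swap device references):

	si = this_cpu_read(swap_priority->pcpu_swapdev->si[order]);
	if (si && get_swap_device_info(si)) {
		offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
		if (offset)
			*entry = swp_entry(si->type, offset);
		put_swap_device(si);
	}

On a miss, allocation falls back to the slow path, which walks the
cgroup's priority plist under swap_avail_lock.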
To avoid an explosion of cluster structures proportional to the number
of cgroups, clusters remain per-device and are shared across cgroups.
This strikes a balance between performance and memory overhead.
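Concretely, only two small structures are involved (both shown in the
diff below): the per-cgroup side caches device pointers, while the
next-offset hint lives in the device itself:

	/* per cgroup, per CPU: which device to try first */
	struct percpu_swap_device {
		struct swap_info_struct *si[SWAP_NR_ORDERS];
	};

	/* per device, per CPU: likely next allocation offset */
	struct percpu_cluster {
		unsigned int next[SWAP_NR_ORDERS];
	};

Per-cgroup overhead is thus one pointer per order per CPU, rather than
a full set of cluster structures per cgroup.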
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
include/linux/swap.h | 7 ++
mm/swap_cgroup_priority.c | 156 +++++++++++++++++++++++++++++++++++++-
mm/swap_cgroup_priority.h | 39 ++++++++++
mm/swapfile.c | 47 +++++++-----
4 files changed, 228 insertions(+), 21 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bfddbec2ee28..ab15f4c103a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -283,6 +283,12 @@ enum swap_cluster_flags {
#define SWAP_NR_ORDERS 1
#endif
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+struct percpu_cluster {
+ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+};
+#endif
+
/*
* We keep using same cluster for rotational device so IO will be sequential.
* The purpose is to optimize SWAP throughput on these device.
@@ -341,6 +347,7 @@ struct swap_info_struct {
struct list_head discard_clusters; /* discard clusters list */
#ifdef CONFIG_SWAP_CGROUP_PRIORITY
u64 id;
+ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
#endif
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
index 84e876b77f01..f960c3dcab48 100644
--- a/mm/swap_cgroup_priority.c
+++ b/mm/swap_cgroup_priority.c
@@ -21,6 +21,17 @@
#include "swap_cgroup_priority.h"
#include "memcontrol-v1.h"
+/*
+ * We maintain a device cache on a per-cgroup, per-CPU basis, but the
+ * underlying cluster cache itself stays per swap device. This keeps
+ * the number of cluster structures bounded by the number of swap
+ * devices rather than growing with the number of
+ * swap_cgroup_priority entries.
+ */
+struct percpu_swap_device {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+};
+
static DEFINE_MUTEX(swap_cgroup_priority_inherit_lck);
static LIST_HEAD(swap_cgroup_priority_list);
@@ -49,6 +60,7 @@ static LIST_HEAD(swap_cgroup_priority_list);
* least_priority - Current lowest priority.
* distance - Priority differences from global swap priority.
* default_prio - Default priority for this cgroup.
+ * pcpu_swapdev - Per-CPU swap device.
* plist - Priority list head.
*/
struct swap_cgroup_priority {
@@ -64,6 +76,7 @@ struct swap_cgroup_priority {
int least_priority;
s8 distance;
int default_prio;
+ struct percpu_swap_device __percpu *pcpu_swapdev;
struct plist_head plist[];
};
@@ -132,6 +145,21 @@ static struct swap_cgroup_priority *get_effective_swap_cgroup_priority(
return swap_priority->effective;
}
+static struct swap_cgroup_priority *get_effective_swap_cgroup_priority_rcu(
+ struct mem_cgroup *memcg)
+{
+ struct swap_cgroup_priority *swap_priority;
+
+ if (!memcg)
+ return NULL;
+
+ swap_priority = rcu_dereference(memcg->swap_priority);
+ if (!swap_priority)
+ return NULL;
+
+ return rcu_dereference(swap_priority->effective);
+}
+
static bool validate_effective_swap_cgroup_priority(
struct mem_cgroup *memcg,
struct swap_cgroup_priority **swap_priority)
@@ -172,6 +200,9 @@ static void free_swap_cgroup_priority_pnode(
static void free_swap_cgroup_priority(
struct swap_cgroup_priority *swap_priority)
{
+ if (swap_priority->pcpu_swapdev)
+ free_percpu(swap_priority->pcpu_swapdev);
+
for (int i = 0; i < MAX_SWAPFILES; i++)
free_swap_cgroup_priority_pnode(swap_priority->pnode[i]);
@@ -187,6 +218,12 @@ static struct swap_cgroup_priority *alloc_swap_cgroup_priority(void)
if (!swap_priority)
return NULL;
+ swap_priority->pcpu_swapdev = alloc_percpu(struct percpu_swap_device);
+ if (!swap_priority->pcpu_swapdev) {
+ kvfree(swap_priority);
+ return NULL;
+ }
+
/*
* Pre-allocates pnode array up to nr_swapfiles at init.
* Individual pnodes are assigned on swapon, but not freed
@@ -326,10 +363,34 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
unsigned long offset;
int node;
- /*
- * TODO: Per-cpu swap cluster cache can't be used directly
- * as cgroup-specific priorities may select different devices.
- */
+ rcu_read_lock();
+ swap_priority = get_effective_swap_cgroup_priority_rcu(memcg);
+ if (!swap_priority) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ /* Fast path */
+ si = this_cpu_read(swap_priority->pcpu_swapdev->si[order]);
+ if (si && get_swap_device_info(si)) {
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ /*
+ * Protected by 'percpu_swap_cluster' local_lock;
+ * CPU migration is disabled during this operation.
+ */
+ this_cpu_write(swap_priority->pcpu_swapdev->si[order],
+ si);
+ put_swap_device(si);
+ rcu_read_unlock();
+
+ return true;
+ }
+ put_swap_device(si);
+ }
+ rcu_read_unlock();
+
+ /* Slow path */
spin_lock(&swap_avail_lock);
node = numa_node_id();
@@ -350,6 +411,14 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order,
SWAP_HAS_CACHE);
+ /*
+ * Protected by 'percpu_swap_cluster' local_lock;
+ * CPU migration is disabled during this operation.
+ */
+ if (memcg->swap_priority == swap_priority)
+ this_cpu_write(
+ swap_priority->pcpu_swapdev->si[order],
+ si);
put_swap_device(si);
if (offset) {
*entry = swp_entry(si->type, offset);
@@ -687,6 +756,21 @@ static int __apply_swap_cgroup_priority(
return 0;
}
+static int init_swap_cgroup_priority_pcpu_swapdev_cache(
+ struct swap_cgroup_priority *swap_priority)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_swap_device *pcp_swap_dev =
+ per_cpu_ptr(swap_priority->pcpu_swapdev, cpu);
+ for (int i = 0; i < SWAP_NR_ORDERS; i++)
+ pcp_swap_dev->si[i] = NULL;
+ }
+
+ return 0;
+}
+
/*
* If this is the top-level swap_cgroup_priority, propagation is needed.
* We traverse the 'mem_cgroup_tree' using 'for_each_mem_cgroup_tree'.
@@ -795,6 +879,8 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
for_each_node(nid)
plist_head_init(&swap_priority->plist[nid]);
+ init_swap_cgroup_priority_pcpu_swapdev_cache(swap_priority);
+
prio_set:
spin_lock(&swap_lock);
spin_lock(&swap_avail_lock);
@@ -843,6 +929,23 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
spin_unlock(&swap_avail_lock);
spin_unlock(&swap_lock);
+ /*
+ * XXX: Updates to the cached next si cannot be fully synchronized
+ * with swap_alloc_cgroup_priority(). Still, flush the per-CPU
+ * cache inside swap_priority as reliably as possible.
+ */
+ if (id != DEFAULT_ID &&
+ swap_priority == swap_priority->effective && !new) {
+ int cpu;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(
+ swap_priority->pcpu_swapdev->si, cpu);
+ for (int i = 0; i < SWAP_NR_ORDERS; i++)
+ pcp_si[i] = NULL;
+ }
+ }
mutex_unlock(&swap_cgroup_priority_inherit_lck);
return 0;
@@ -886,3 +989,48 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
spin_unlock(&swap_avail_lock);
mutex_unlock(&swap_cgroup_priority_inherit_lck);
}
+
+void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+ struct swap_cgroup_priority *swap_priority;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(swap_priority,
+ &swap_cgroup_priority_list, link) {
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(
+ swap_priority->pcpu_swapdev->si, cpu);
+
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+ }
+ rcu_read_unlock();
+}
+
+bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ int cpu, i;
+
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ }
+
+ return true;
+}
+
+void free_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ free_percpu(si->percpu_cluster);
+ si->percpu_cluster = NULL;
+}
diff --git a/mm/swap_cgroup_priority.h b/mm/swap_cgroup_priority.h
index 5d16b63d12e0..815822ebd0d1 100644
--- a/mm/swap_cgroup_priority.h
+++ b/mm/swap_cgroup_priority.h
@@ -47,6 +47,22 @@ struct swap_cgroup_priority *inherit_swap_cgroup_priority(
bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg, swp_entry_t *entry,
int order);
void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
+void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si);
+
+bool alloc_percpu_swap_cluster(struct swap_info_struct *si);
+void free_percpu_swap_cluster(struct swap_info_struct *si);
+static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
+ int order,
+ unsigned int next)
+{
+ this_cpu_write(si->percpu_cluster->next[order], next);
+}
+
+static inline unsigned int read_percpu_swap_cluster_next(
+ struct swap_info_struct *si, int order)
+{
+ return __this_cpu_read(si->percpu_cluster->next[order]);
+}
#else
int swap_node(struct swap_info_struct *si);
unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
@@ -85,5 +101,28 @@ static inline bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
{
}
+static inline void flush_swap_cgroup_priority_percpu_swapdev(
+ struct swap_info_struct *si)
+{
+}
+static inline bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ return true;
+}
+static inline void free_percpu_swap_cluster(struct swap_info_struct *si)
+{
+}
+static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
+ int order,
+ unsigned int next)
+{
+}
+
+static inline unsigned int read_percpu_swap_cluster_next(
+ struct swap_info_struct *si, int order)
+{
+ return SWAP_ENTRY_INVALID;
+}
#endif
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bfd0532ad250..6a5ac9962e9f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -817,12 +817,15 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
+ write_percpu_swap_cluster_next(si, order, next);
} else {
si->global_cluster->next[order] = next;
}
return found;
}
@@ -892,26 +895,29 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
if (order && !(si->flags & SWP_BLKDEV))
return 0;
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+ offset = read_percpu_swap_cluster_next(si, order);
+ } else {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- if (offset == SWAP_ENTRY_INVALID)
- goto new_cluster;
+ }
- ci = lock_cluster(si, offset);
- /* Cluster could have been used by another order */
- if (cluster_is_usable(ci, order)) {
- if (cluster_is_empty(ci))
- offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset,
- order, usage);
- } else {
- unlock_cluster(ci);
- }
- if (found)
- goto done;
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
+
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage);
+ } else {
+ unlock_cluster(ci);
}
+ if (found)
+ goto done;
new_cluster:
ci = isolate_lock_cluster(si, &si->free_clusters);
@@ -991,6 +997,7 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -2674,6 +2681,8 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
for (i = 0; i < SWAP_NR_ORDERS; i++)
cmpxchg(&pcp_si[i], si, NULL);
}
+
+ flush_swap_cgroup_priority_percpu_swapdev(si);
}
@@ -2802,6 +2811,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
+ free_percpu_swap_cluster(p);
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
@@ -3239,7 +3248,10 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+ if (!alloc_percpu_swap_cluster(si))
+ goto err_free;
+ } else {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
@@ -3532,6 +3544,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
+ free_percpu_swap_cluster(si);
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;
--
2.34.1