From: Youngjun Park <youngjun.park@lge.com>
To: akpm@linux-foundation.org, hannes@cmpxchg.org
Cc: mhocko@kernel.org, roman.gushchin@linux.dev,
shakeel.butt@linux.dev, muchun.song@linux.dev,
shikemeng@huaweicloud.com, kasong@tencent.com, nphamcs@gmail.com,
bhe@redhat.com, baohua@kernel.org, chrisl@kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, gunho.lee@lge.com,
iamjoonsoo.kim@lge.com, taejoon.song@lge.com,
Youngjun Park <youngjun.park@lge.com>
Subject: [PATCH 4/4] mm: swap: Per-cgroup per-CPU swap device cache with shared clusters
Date: Thu, 17 Jul 2025 05:20:06 +0900
Message-ID: <20250716202006.3640584-5-youngjun.park@lge.com>
In-Reply-To: <20250716202006.3640584-1-youngjun.park@lge.com>
Introduce a new swap allocation mechanism that supports per-cgroup
per-CPU swap device caches, combined with per-device per-CPU cluster
management.
The existing global swap allocator uses a per-CPU device cache and
cluster, shared by all cgroups. Under this model, per-cgroup swap
priorities cannot be effectively honored on the fast path, as allocations
do not distinguish between cgroups.
To address this, we introduce per-cgroup per-CPU swap device caches.
This allows fast-path swap allocations to respect each cgroup’s
individual priority settings.
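For illustration, the fast path then amounts to the following
(simplified sketch of swap_alloc_cgroup_priority() below; the real
code also takes RCU and swap device references):

	si = this_cpu_read(swap_priority->pcpu_swapdev->si[order]);
	if (si && get_swap_device_info(si)) {
		offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
		if (offset)
			*entry = swp_entry(si->type, offset);
		put_swap_device(si);
	}

On a miss, allocation falls back to the slow path, which walks the
cgroup's priority plist under swap_avail_lock.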
To avoid an explosion of cluster structures proportional to the number
of cgroups, clusters remain per-device and are shared across cgroups.
This strikes a balance between performance and memory overhead.
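Concretely, only two small structures are involved (both shown in the
diff below): the per-cgroup side caches device pointers, while the
next-offset hint lives in the device itself:

	/* per cgroup, per CPU: which device to try first */
	struct percpu_swap_device {
		struct swap_info_struct *si[SWAP_NR_ORDERS];
	};

	/* per device, per CPU: likely next allocation offset */
	struct percpu_cluster {
		unsigned int next[SWAP_NR_ORDERS];
	};

Per-cgroup overhead is thus one pointer per order per CPU, rather than
a full set of cluster structures per cgroup.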
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
include/linux/swap.h | 7 ++
mm/swap_cgroup_priority.c | 156 +++++++++++++++++++++++++++++++++++++-
mm/swap_cgroup_priority.h | 39 ++++++++++
mm/swapfile.c | 47 +++++++-----
4 files changed, 228 insertions(+), 21 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bfddbec2ee28..ab15f4c103a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -283,6 +283,12 @@ enum swap_cluster_flags {
#define SWAP_NR_ORDERS 1
#endif
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+struct percpu_cluster {
+ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+};
+#endif
+
/*
* We keep using same cluster for rotational device so IO will be sequential.
* The purpose is to optimize SWAP throughput on these device.
@@ -341,6 +347,7 @@ struct swap_info_struct {
struct list_head discard_clusters; /* discard clusters list */
#ifdef CONFIG_SWAP_CGROUP_PRIORITY
u64 id;
+ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
#endif
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
index 84e876b77f01..f960c3dcab48 100644
--- a/mm/swap_cgroup_priority.c
+++ b/mm/swap_cgroup_priority.c
@@ -21,6 +21,17 @@
#include "swap_cgroup_priority.h"
#include "memcontrol-v1.h"
+/*
+ * We maintain a device cache on a per-cgroup, per-CPU basis, but the
+ * underlying cluster cache itself stays per swap device. This keeps
+ * the number of cluster structures bounded by the number of swap
+ * devices rather than growing with the number of
+ * swap_cgroup_priority entries.
+ */
+struct percpu_swap_device {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+};
+
static DEFINE_MUTEX(swap_cgroup_priority_inherit_lck);
static LIST_HEAD(swap_cgroup_priority_list);
@@ -49,6 +60,7 @@ static LIST_HEAD(swap_cgroup_priority_list);
* least_priority - Current lowest priority.
* distance - Priority differences from global swap priority.
* default_prio - Default priority for this cgroup.
+ * pcpu_swapdev - Per-CPU swap device.
* plist - Priority list head.
*/
struct swap_cgroup_priority {
@@ -64,6 +76,7 @@ struct swap_cgroup_priority {
int least_priority;
s8 distance;
int default_prio;
+ struct percpu_swap_device __percpu *pcpu_swapdev;
struct plist_head plist[];
};
@@ -132,6 +145,21 @@ static struct swap_cgroup_priority *get_effective_swap_cgroup_priority(
return swap_priority->effective;
}
+static struct swap_cgroup_priority *get_effective_swap_cgroup_priority_rcu(
+ struct mem_cgroup *memcg)
+{
+ struct swap_cgroup_priority *swap_priority;
+
+ if (!memcg)
+ return NULL;
+
+ swap_priority = rcu_dereference(memcg->swap_priority);
+ if (!swap_priority)
+ return NULL;
+
+ return rcu_dereference(swap_priority->effective);
+}
+
static bool validate_effective_swap_cgroup_priority(
struct mem_cgroup *memcg,
struct swap_cgroup_priority **swap_priority)
@@ -172,6 +200,9 @@ static void free_swap_cgroup_priority_pnode(
static void free_swap_cgroup_priority(
struct swap_cgroup_priority *swap_priority)
{
+ if (swap_priority->pcpu_swapdev)
+ free_percpu(swap_priority->pcpu_swapdev);
+
for (int i = 0; i < MAX_SWAPFILES; i++)
free_swap_cgroup_priority_pnode(swap_priority->pnode[i]);
@@ -187,6 +218,12 @@ static struct swap_cgroup_priority *alloc_swap_cgroup_priority(void)
if (!swap_priority)
return NULL;
+ swap_priority->pcpu_swapdev = alloc_percpu(struct percpu_swap_device);
+ if (!swap_priority->pcpu_swapdev) {
+ kvfree(swap_priority);
+ return NULL;
+ }
+
/*
* Pre-allocates pnode array up to nr_swapfiles at init.
* Individual pnodes are assigned on swapon, but not freed
@@ -326,10 +363,34 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
unsigned long offset;
int node;
- /*
- * TODO: Per-cpu swap cluster cache can't be used directly
- * as cgroup-specific priorities may select different devices.
- */
+ rcu_read_lock();
+ swap_priority = get_effective_swap_cgroup_priority_rcu(memcg);
+ if (!swap_priority) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ /* Fast path */
+ si = this_cpu_read(swap_priority->pcpu_swapdev->si[order]);
+ if (si && get_swap_device_info(si)) {
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ /*
+ * Protected by 'percpu_swap_cluster' local_lock;
+ * CPU migration is disabled during this operation.
+ */
+ this_cpu_write(swap_priority->pcpu_swapdev->si[order],
+ si);
+ put_swap_device(si);
+ rcu_read_unlock();
+
+ return true;
+ }
+ put_swap_device(si);
+ }
+ rcu_read_unlock();
+
+ /* Slow path */
spin_lock(&swap_avail_lock);
node = numa_node_id();
@@ -350,6 +411,14 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order,
SWAP_HAS_CACHE);
+ /*
+ * Protected by 'percpu_swap_cluster' local_lock;
+ * CPU migration is disabled during this operation.
+ */
+ if (memcg->swap_priority == swap_priority)
+ this_cpu_write(
+ swap_priority->pcpu_swapdev->si[order],
+ si);
put_swap_device(si);
if (offset) {
*entry = swp_entry(si->type, offset);
@@ -687,6 +756,21 @@ static int __apply_swap_cgroup_priority(
return 0;
}
+static int init_swap_cgroup_priority_pcpu_swapdev_cache(
+ struct swap_cgroup_priority *swap_priority)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_swap_device *pcp_swap_dev =
+ per_cpu_ptr(swap_priority->pcpu_swapdev, cpu);
+ for (int i = 0; i < SWAP_NR_ORDERS; i++)
+ pcp_swap_dev->si[i] = NULL;
+ }
+
+ return 0;
+}
+
/*
* If this is the top-level swap_cgroup_priority, propagation is needed.
* We traverse the 'mem_cgroup_tree' using 'for_each_mem_cgroup_tree'.
@@ -795,6 +879,8 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
for_each_node(nid)
plist_head_init(&swap_priority->plist[nid]);
+ init_swap_cgroup_priority_pcpu_swapdev_cache(swap_priority);
+
prio_set:
spin_lock(&swap_lock);
spin_lock(&swap_avail_lock);
@@ -843,6 +929,23 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
spin_unlock(&swap_avail_lock);
spin_unlock(&swap_lock);
+ /*
+ * XXX: Updates to the cached next si cannot be fully synchronized
+ * with swap_alloc_cgroup_priority(). Still, flush the per-CPU
+ * cache inside swap_priority as reliably as possible.
+ */
+ if (id != DEFAULT_ID &&
+ swap_priority == swap_priority->effective && !new) {
+ int cpu;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(
+ swap_priority->pcpu_swapdev->si, cpu);
+ for (int i = 0; i < SWAP_NR_ORDERS; i++)
+ pcp_si[i] = NULL;
+ }
+ }
mutex_unlock(&swap_cgroup_priority_inherit_lck);
return 0;
@@ -886,3 +989,48 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
spin_unlock(&swap_avail_lock);
mutex_unlock(&swap_cgroup_priority_inherit_lck);
}
+
+void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+ struct swap_cgroup_priority *swap_priority;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(swap_priority,
+ &swap_cgroup_priority_list, link) {
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(
+ swap_priority->pcpu_swapdev->si, cpu);
+
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+ }
+ rcu_read_unlock();
+}
+
+bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ int cpu, i;
+
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ }
+
+ return true;
+}
+
+void free_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ free_percpu(si->percpu_cluster);
+ si->percpu_cluster = NULL;
+}
diff --git a/mm/swap_cgroup_priority.h b/mm/swap_cgroup_priority.h
index 5d16b63d12e0..815822ebd0d1 100644
--- a/mm/swap_cgroup_priority.h
+++ b/mm/swap_cgroup_priority.h
@@ -47,6 +47,22 @@ struct swap_cgroup_priority *inherit_swap_cgroup_priority(
bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg, swp_entry_t *entry,
int order);
void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
+void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si);
+
+bool alloc_percpu_swap_cluster(struct swap_info_struct *si);
+void free_percpu_swap_cluster(struct swap_info_struct *si);
+static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
+ int order,
+ unsigned int next)
+{
+ this_cpu_write(si->percpu_cluster->next[order], next);
+}
+
+static inline unsigned int read_percpu_swap_cluster_next(
+ struct swap_info_struct *si, int order)
+{
+ return __this_cpu_read(si->percpu_cluster->next[order]);
+}
#else
int swap_node(struct swap_info_struct *si);
unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
@@ -85,5 +101,28 @@ static inline bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
{
}
+static inline void flush_swap_cgroup_priority_percpu_swapdev(
+ struct swap_info_struct *si)
+{
+}
+static inline bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ return true;
+}
+static inline void free_percpu_swap_cluster(struct swap_info_struct *si)
+{
+}
+static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
+ int order,
+ unsigned int next)
+{
+}
+
+static inline unsigned int read_percpu_swap_cluster_next(
+ struct swap_info_struct *si, int order)
+{
+ return SWAP_ENTRY_INVALID;
+}
#endif
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bfd0532ad250..6a5ac9962e9f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -817,12 +817,15 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
+ write_percpu_swap_cluster_next(si, order, next);
} else {
si->global_cluster->next[order] = next;
}
return found;
}
@@ -892,26 +895,29 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
if (order && !(si->flags & SWP_BLKDEV))
return 0;
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+ offset = read_percpu_swap_cluster_next(si, order);
+ } else {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- if (offset == SWAP_ENTRY_INVALID)
- goto new_cluster;
+ }
- ci = lock_cluster(si, offset);
- /* Cluster could have been used by another order */
- if (cluster_is_usable(ci, order)) {
- if (cluster_is_empty(ci))
- offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset,
- order, usage);
- } else {
- unlock_cluster(ci);
- }
- if (found)
- goto done;
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
+
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage);
+ } else {
+ unlock_cluster(ci);
}
+ if (found)
+ goto done;
new_cluster:
ci = isolate_lock_cluster(si, &si->free_clusters);
@@ -991,6 +997,7 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -2674,6 +2681,8 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
for (i = 0; i < SWAP_NR_ORDERS; i++)
cmpxchg(&pcp_si[i], si, NULL);
}
+
+ flush_swap_cgroup_priority_percpu_swapdev(si);
}
@@ -2802,6 +2811,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
+ free_percpu_swap_cluster(p);
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
@@ -3239,7 +3248,10 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+ if (!alloc_percpu_swap_cluster(si))
+ goto err_free;
+ } else {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
@@ -3532,6 +3544,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
+ free_percpu_swap_cluster(si);
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;
--
2.34.1