Linux cgroups development
 help / color / mirror / Atom feed
* [RFC PATCH 1/5] mm, swap: add virtual swap device infrastructure
From: Nhat Pham @ 2026-05-28 21:29 UTC (permalink / raw)
  To: kasong
  Cc: Liam.Howlett, akpm, apopple, axelrasmussen, baohua, baolin.wang,
	bhe, byungchul, cgroups, chengming.zhou, chrisl, corbet, david,
	dev.jain, gourry, hannes, hughd, jannh, joshua.hahnjy, lance.yang,
	lenb, linux-doc, linux-kernel, linux-mm, linux-pm,
	lorenzo.stoakes, matthew.brost, mhocko, muchun.song, npache,
	nphamcs, pavel, peterx, peterz, pfalcato, rafael, rakie.kim,
	roman.gushchin, rppt, ryan.roberts, shakeel.butt, shikemeng,
	surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed, yuanchu,
	zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>

Create a massive virtual swap device at boot, along with the
dynamic cluster infrastructure that the rest of the vswap layer
is built on:

  - swap_cluster_info_dynamic: per-cluster dynamic info kept in
    an xarray, allowing arbitrary-size devices without the static
    cluster_info[] array.
  - virtual_table: a per-slot side table for vswap backend metadata
    (tag-encoded in low bits). The field itself is added in the
    next patch; this commit only introduces the dynamic cluster
    container that will hold it.
  - The size of the vswap device is ALIGN_DOWN(UINT_MAX,
    SWAPFILE_CLUSTER) pages, capped at swapfile_maximum_size.

Gated by a new CONFIG_VSWAP (depends on SWAP && 64BIT). For now,
the vswap device cannot be swapon'd or swapoff'd — it is created
unconditionally at boot when CONFIG_VSWAP=y and lives for the
lifetime of the kernel. The SWP_VSWAP flag and swap_is_vswap()
helper let hot paths skip per-device bookkeeping that doesn't
apply (avail-list management, percpu_ref get/put, hibernation
target lookup, etc.).

This patch is pure scaffolding: it introduces the device, the
dynamic-cluster machinery, and the general shape of a vswap
allocator (with sanity checks), but does not hook the vswap device
into any allocation path. folio_alloc_swap will not produce vswap
entries until a subsequent patch wires it in. Backends (zswap,
zero, physical disk) and the vswap-aware swap-out / swap-in /
writeback paths arrive in subsequent patches.

Suggested-by: Kairui Song <kasong@tencent.com>
Co-developed-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 MAINTAINERS          |   1 +
 include/linux/swap.h |   4 +
 mm/Kconfig           |  10 ++
 mm/page_io.c         |  18 ++-
 mm/swap.h            |  46 ++++++--
 mm/swap_state.c      |  43 ++++---
 mm/swap_table.h      |   2 +
 mm/swapfile.c        | 264 +++++++++++++++++++++++++++++++++++++++----
 mm/vswap.h           |  29 +++++
 mm/zswap.c           |  10 +-
 10 files changed, 375 insertions(+), 52 deletions(-)
 create mode 100644 mm/vswap.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9be179722d42..e96bd0bf6307 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17041,6 +17041,7 @@ F:	mm/swap.h
 F:	mm/swap_table.h
 F:	mm/swap_state.c
 F:	mm/swapfile.c
+F:	mm/vswap.h
 
 MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE)
 M:	Andrew Morton <akpm@linux-foundation.org>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6d72778e6cc3..ee9b1e76b058 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,6 +214,7 @@ enum {
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
 	SWP_HIBERNATION = (1 << 13),	/* pinned for hibernation */
+	SWP_VSWAP	= (1 << 14),	/* virtual swap device */
 					/* add others here before... */
 };
 
@@ -282,6 +283,7 @@ struct swap_info_struct {
 	struct work_struct reclaim_work; /* reclaim worker */
 	struct list_head discard_clusters; /* discard clusters list */
 	struct plist_node avail_list;   /* entry in swap_avail_head */
+	struct xarray cluster_info_pool; /* Xarray for vswap dynamic cluster info */
 };
 
 static inline swp_entry_t page_swap_entry(struct page *page)
@@ -473,6 +475,8 @@ void swap_free_hibernation_slot(swp_entry_t entry);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
+	if (si->flags & SWP_VSWAP)
+		return;
 	percpu_ref_put(&si->users);
 }
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 776b67c66e82..fc395ae3dde8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -19,6 +19,16 @@ menuconfig SWAP
 	  used to provide more virtual memory than the actual RAM present
 	  in your computer.  If unsure say Y.
 
+config VSWAP
+	bool "Virtual swap device"
+	depends on SWAP && 64BIT
+	help
+	  Adds a virtual swap layer that decouples swap entries in page
+	  tables from physical backing storage. Swap entries are allocated
+	  from a virtual swap device and can be backed by zswap, a physical
+	  swapfile, or kept in memory — with the backing changeable at
+	  runtime without invalidating page table entries.
+
 config ZSWAP
 	bool "Compressed cache for swap pages"
 	depends on SWAP
diff --git a/mm/page_io.c b/mm/page_io.c
index f2d8fe7fd057..8126be6e4cfb 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -295,8 +295,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	}
 	rcu_read_unlock();
 
-	__swap_writepage(folio, swap_plug);
-	return 0;
+	return __swap_writepage(folio, swap_plug);
 out_unlock:
 	folio_unlock(folio);
 	return ret;
@@ -458,11 +457,18 @@ static void swap_writepage_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+	if (sis->flags & SWP_VSWAP) {
+		/* Redirty the folio to prevent it from getting reclaimed. */
+		folio_set_dirty(folio);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
 	/*
 	 * ->flags can be updated non-atomically,
 	 * but that will never affect SWP_FS_OPS, so the data_race
@@ -479,6 +485,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 		swap_writepage_bdev_sync(folio, sis);
 	else
 		swap_writepage_bdev_async(folio, sis);
+	return 0;
 }
 
 void swap_write_unplug(struct swap_iocb *sio)
@@ -684,6 +691,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	if (zswap_load(folio) != -ENOENT)
 		goto finish;
 
+	if (unlikely(sis->flags & SWP_VSWAP)) {
+		folio_unlock(folio);
+		goto finish;
+	}
+
 	/* We have to read from slower devices. Increase zswap protection. */
 	zswap_folio_swapin(folio);
 
diff --git a/mm/swap.h b/mm/swap.h
index 81c06aae7ccd..479ee5871cb9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -65,6 +65,13 @@ struct swap_cluster_info {
 	struct list_head list;
 };
 
+struct swap_cluster_info_dynamic {
+	struct swap_cluster_info ci;	/* Underlying cluster info */
+	unsigned int index;		/* for cluster_index() */
+	struct rcu_head rcu;		/* For kfree_rcu deferred free */
+	/* Backend pointers (virtual_table) added in a later patch. */
+};
+
 /* All on-list cluster must have a non-zero flag. */
 enum swap_cluster_flags {
 	CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
@@ -75,6 +82,7 @@ enum swap_cluster_flags {
 	CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
 	CLUSTER_FLAG_FULL,
 	CLUSTER_FLAG_DISCARD,
+	CLUSTER_FLAG_DEAD,	/* Vswap dynamic cluster pending kfree_rcu */
 	CLUSTER_FLAG_MAX,
 };
 
@@ -108,9 +116,19 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
 static inline struct swap_cluster_info *__swap_offset_to_cluster(
 		struct swap_info_struct *si, pgoff_t offset)
 {
+	unsigned int cluster_idx = offset / SWAPFILE_CLUSTER;
+
 	VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
 	VM_WARN_ON_ONCE(offset >= roundup(si->max, SWAPFILE_CLUSTER));
-	return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+
+	if (si->flags & SWP_VSWAP) {
+		struct swap_cluster_info_dynamic *ci_dyn;
+
+		ci_dyn = xa_load(&si->cluster_info_pool, cluster_idx);
+		return ci_dyn ? &ci_dyn->ci : NULL;
+	}
+
+	return &si->cluster_info[cluster_idx];
 }
 
 static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
@@ -122,7 +140,7 @@ static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entr
 static __always_inline struct swap_cluster_info *__swap_cluster_lock(
 		struct swap_info_struct *si, unsigned long offset, bool irq)
 {
-	struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
+	struct swap_cluster_info *ci;
 
 	/*
 	 * Nothing modifies swap cache in an IRQ context. All access to
@@ -135,10 +153,24 @@ static __always_inline struct swap_cluster_info *__swap_cluster_lock(
 	 */
 	VM_WARN_ON_ONCE(!in_task());
 	VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
-	if (irq)
-		spin_lock_irq(&ci->lock);
-	else
-		spin_lock(&ci->lock);
+
+	rcu_read_lock();
+	ci = __swap_offset_to_cluster(si, offset);
+	if (ci) {
+		if (irq)
+			spin_lock_irq(&ci->lock);
+		else
+			spin_lock(&ci->lock);
+
+		if (ci->flags == CLUSTER_FLAG_DEAD) {
+			if (irq)
+				spin_unlock_irq(&ci->lock);
+			else
+				spin_unlock(&ci->lock);
+			ci = NULL;
+		}
+	}
+	rcu_read_unlock();
 	return ci;
 }
 
@@ -250,7 +282,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 }
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swap_space __read_mostly;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 04f5ce992401..b063c47138c5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -90,8 +90,10 @@ struct folio *swap_cache_get_folio(swp_entry_t entry)
 	struct folio *folio;
 
 	for (;;) {
+		rcu_read_lock();
 		swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
 					swp_cluster_offset(entry));
+		rcu_read_unlock();
 		if (!swp_tb_is_folio(swp_tb))
 			return NULL;
 		folio = swp_tb_to_folio(swp_tb);
@@ -113,8 +115,10 @@ bool swap_cache_has_folio(swp_entry_t entry)
 {
 	unsigned long swp_tb;
 
+	rcu_read_lock();
 	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
 				swp_cluster_offset(entry));
+	rcu_read_unlock();
 	return swp_tb_is_folio(swp_tb);
 }
 
@@ -130,8 +134,10 @@ void *swap_cache_get_shadow(swp_entry_t entry)
 {
 	unsigned long swp_tb;
 
+	rcu_read_lock();
 	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
 				swp_cluster_offset(entry));
+	rcu_read_unlock();
 	if (swp_tb_is_shadow(swp_tb))
 		return swp_tb_to_shadow(swp_tb);
 	return NULL;
@@ -400,14 +406,16 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
  * -ENOENT / -EEXIST: Target swap entry is unavailable or cached, the caller
  *                    should abort or try to use the cached folio instead
  */
-static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
-					swp_entry_t targ_entry, gfp_t gfp,
+static struct folio *__swap_cache_alloc(swp_entry_t targ_entry, gfp_t gfp,
 					unsigned int order, struct vm_fault *vmf,
 					struct mempolicy *mpol, pgoff_t ilx)
 {
 	int err;
 	swp_entry_t entry;
 	struct folio *folio;
+	struct swap_cluster_info *ci;
+	struct swap_info_struct *si = __swap_entry_to_info(targ_entry);
+	unsigned long offset = swp_offset(targ_entry);
 	void *shadow = NULL;
 	unsigned short memcg_id;
 	unsigned long address, nr_pages = 1UL << order;
@@ -417,9 +425,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 	entry.val = round_down(targ_entry.val, nr_pages);
 
 	/* Check if the slot and range are available, skip allocation if not */
-	spin_lock(&ci->lock);
-	err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL, NULL);
-	spin_unlock(&ci->lock);
+	err = -ENOENT;
+	ci = swap_cluster_lock(si, offset);
+	if (ci) {
+		err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL, NULL);
+		swap_cluster_unlock(ci);
+	}
 	if (unlikely(err))
 		return ERR_PTR(err);
 
@@ -440,10 +451,13 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 		return ERR_PTR(-ENOMEM);
 
 	/* Double check the range is still not in conflict */
-	spin_lock(&ci->lock);
-	err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow, &memcg_id);
+	err = -ENOENT;
+	ci = swap_cluster_lock(si, offset);
+	if (ci)
+		err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow, &memcg_id);
 	if (unlikely(err)) {
-		spin_unlock(&ci->lock);
+		if (ci)
+			swap_cluster_unlock(ci);
 		folio_put(folio);
 		return ERR_PTR(err);
 	}
@@ -451,13 +465,14 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 	__folio_set_locked(folio);
 	__folio_set_swapbacked(folio);
 	__swap_cache_do_add_folio(ci, folio, entry);
-	spin_unlock(&ci->lock);
+	swap_cluster_unlock(ci);
 
 	if (mem_cgroup_swapin_charge_folio(folio, memcg_id,
 					   vmf ? vmf->vma->vm_mm : NULL, gfp)) {
-		spin_lock(&ci->lock);
+		/* The folio pins the cluster */
+		ci = swap_cluster_lock(si, offset);
 		__swap_cache_do_del_folio(ci, folio, entry, shadow);
-		spin_unlock(&ci->lock);
+		swap_cluster_unlock(ci);
 		folio_unlock(folio);
 		/* nr_pages refs from swap cache, 1 from allocation */
 		folio_put_refs(folio, nr_pages + 1);
@@ -501,9 +516,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
 {
 	int order, err;
 	struct folio *ret;
-	struct swap_cluster_info *ci;
 
-	ci = __swap_entry_to_cluster(targ_entry);
 	order = highest_order(orders);
 
 	/* orders must be non-zero, and must not exceed cluster size. */
@@ -511,12 +524,12 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
 		return ERR_PTR(-EINVAL);
 
 	do {
-		ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
+		ret = __swap_cache_alloc(targ_entry, gfp, order,
 					 vmf, mpol, ilx);
 		if (!IS_ERR(ret))
 			break;
 		err = PTR_ERR(ret);
-		if (!order || (err && err != -EBUSY && err != -ENOMEM))
+		if (err && err != -EBUSY && err != -ENOMEM)
 			break;
 		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
 		order = next_order(&orders, order);
diff --git a/mm/swap_table.h b/mm/swap_table.h
index e6613e62f8d0..fd7f0fb9836a 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -255,6 +255,8 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
 	unsigned long swp_tb;
 
 	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+	if (!ci)
+		return SWP_TB_NULL;
 
 	rcu_read_lock();
 	table = rcu_dereference(ci->table);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a9a1e477fec9..f6d2529159ff 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,10 +42,12 @@
 #include <linux/suspend.h>
 #include <linux/zswap.h>
 #include <linux/plist.h>
+#include <linux/major.h>
 
 #include <asm/tlbflush.h>
 #include <linux/leafops.h>
 #include "swap_table.h"
+#include "vswap.h"
 #include "internal.h"
 #include "swap.h"
 
@@ -401,6 +403,8 @@ static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
 static inline unsigned int cluster_index(struct swap_info_struct *si,
 					 struct swap_cluster_info *ci)
 {
+	if (si->flags & SWP_VSWAP)
+		return container_of(ci, struct swap_cluster_info_dynamic, ci)->index;
 	return ci - si->cluster_info;
 }
 
@@ -734,6 +738,22 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
 		return;
 	}
 
+	if (si->flags & SWP_VSWAP) {
+		struct swap_cluster_info_dynamic *ci_dyn;
+
+		ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+		if (ci->flags != CLUSTER_FLAG_NONE) {
+			spin_lock(&si->lock);
+			list_del(&ci->list);
+			spin_unlock(&si->lock);
+		}
+		swap_cluster_free_table(ci);
+		xa_erase(&si->cluster_info_pool, ci_dyn->index);
+		ci->flags = CLUSTER_FLAG_DEAD;
+		kfree_rcu(ci_dyn, rcu);
+		return;
+	}
+
 	__free_cluster(si, ci);
 }
 
@@ -836,14 +856,21 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si,
  * stolen by a lower order). @usable will be set to false if that happens.
  */
 static bool cluster_reclaim_range(struct swap_info_struct *si,
-				  struct swap_cluster_info *ci,
+				  struct swap_cluster_info **pcip,
 				  unsigned long start, unsigned int order,
 				  bool *usable)
 {
+	struct swap_cluster_info *ci = *pcip;
 	unsigned int nr_pages = 1 << order;
 	unsigned long offset = start, end = start + nr_pages;
 	unsigned long swp_tb;
 
+	/*
+	 * Take RCU read lock before releasing the cluster lock to keep ci
+	 * alive — for vswap dynamic clusters, ci is freed via kfree_rcu
+	 * and the grace period could otherwise elapse in the window.
+	 */
+	rcu_read_lock();
 	spin_unlock(&ci->lock);
 	do {
 		swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
@@ -853,7 +880,15 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
 				break;
 	} while (++offset < end);
-	spin_lock(&ci->lock);
+	rcu_read_unlock();
+
+	/* Re-lookup: dynamic cluster may have been freed while lock was dropped */
+	ci = swap_cluster_lock(si, start);
+	*pcip = ci;
+	if (!ci) {
+		*usable = false;
+		return false;
+	}
 
 	/*
 	 * We just dropped ci->lock so cluster could be used by another
@@ -984,7 +1019,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 		if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
 			continue;
 		if (need_reclaim) {
-			ret = cluster_reclaim_range(si, ci, offset, order, &usable);
+			ret = cluster_reclaim_range(si, &ci, offset, order,
+						    &usable);
 			if (!usable)
 				goto out;
 			if (cluster_is_empty(ci))
@@ -1002,8 +1038,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 		break;
 	}
 out:
-	relocate_cluster(si, ci);
-	swap_cluster_unlock(ci);
+	if (ci) {
+		relocate_cluster(si, ci);
+		swap_cluster_unlock(ci);
+	}
 	if (si->flags & SWP_SOLIDSTATE) {
 		this_cpu_write(percpu_swap_cluster.offset[order], next);
 		this_cpu_write(percpu_swap_cluster.si[order], si);
@@ -1035,6 +1073,41 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
 	return found;
 }
 
+static unsigned int alloc_swap_scan_dynamic(struct swap_info_struct *si,
+					    struct folio *folio)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	struct swap_cluster_info *ci;
+	unsigned long offset;
+
+	WARN_ON(!(si->flags & SWP_VSWAP));
+
+	ci_dyn = kzalloc(sizeof(*ci_dyn), GFP_ATOMIC);
+	if (!ci_dyn)
+		return SWAP_ENTRY_INVALID;
+
+	spin_lock_init(&ci_dyn->ci.lock);
+	INIT_LIST_HEAD(&ci_dyn->ci.list);
+
+	if (swap_cluster_alloc_table(&ci_dyn->ci, GFP_ATOMIC)) {
+		kfree(ci_dyn);
+		return SWAP_ENTRY_INVALID;
+	}
+
+	if (xa_alloc(&si->cluster_info_pool, &ci_dyn->index, ci_dyn,
+		     XA_LIMIT(1, DIV_ROUND_UP(si->max, SWAPFILE_CLUSTER) - 1),
+		     GFP_ATOMIC)) {
+		swap_cluster_free_table(&ci_dyn->ci);
+		kfree(ci_dyn);
+		return SWAP_ENTRY_INVALID;
+	}
+
+	ci = &ci_dyn->ci;
+	spin_lock(&ci->lock);
+	offset = cluster_offset(si, ci);
+	return alloc_swap_scan_cluster(si, ci, folio, offset);
+}
+
 static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 {
 	long to_scan = 1;
@@ -1057,7 +1130,9 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 				spin_unlock(&ci->lock);
 				nr_reclaim = __try_to_reclaim_swap(si, offset,
 								   TTRS_ANYWAY);
-				spin_lock(&ci->lock);
+				ci = swap_cluster_lock(si, offset);
+				if (!ci)
+					goto next;
 				if (nr_reclaim) {
 					offset += abs(nr_reclaim);
 					continue;
@@ -1071,6 +1146,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 			relocate_cluster(si, ci);
 
 		swap_cluster_unlock(ci);
+next:
 		if (to_scan <= 0)
 			break;
 		cond_resched();
@@ -1141,6 +1217,12 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 			goto done;
 	}
 
+	if (si->flags & SWP_VSWAP) {
+		found = alloc_swap_scan_dynamic(si, folio);
+		if (found)
+			goto done;
+	}
+
 	if (!(si->flags & SWP_PAGE_DISCARD)) {
 		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
 		if (found)
@@ -1259,6 +1341,13 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 			goto skip;
 	}
 
+	/*
+	 * Keep vswap off the avail list — the physical swap allocator
+	 * (swap_alloc_fast/slow) never allocates from it.
+	 */
+	if (swap_is_vswap(si))
+		goto skip;
+
 	plist_add(&si->avail_list, &swap_avail_head);
 
 skip:
@@ -1341,6 +1430,10 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 
 static bool get_swap_device_info(struct swap_info_struct *si)
 {
+	/* vswap device is always alive — no ref counting needed */
+	if (swap_is_vswap(si))
+		return true;
+
 	if (!percpu_ref_tryget_live(&si->users))
 		return false;
 	/*
@@ -1376,11 +1469,11 @@ static bool swap_alloc_fast(struct folio *folio)
 		return false;
 
 	ci = swap_cluster_lock(si, offset);
-	if (cluster_is_usable(ci, order)) {
+	if (ci && cluster_is_usable(ci, order)) {
 		if (cluster_is_empty(ci))
 			offset = cluster_offset(si, ci);
 		alloc_swap_scan_cluster(si, ci, folio, offset);
-	} else {
+	} else if (ci) {
 		swap_cluster_unlock(ci);
 	}
 
@@ -1484,6 +1577,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
 	if (!si)
 		return 0;
 
+	/* Entry is in use (being faulted in), so its cluster is alive. */
 	ci = __swap_offset_to_cluster(si, offset);
 	ret = swap_extend_table_alloc(si, ci, gfp);
 
@@ -1711,6 +1805,7 @@ int folio_alloc_swap(struct folio *folio)
 	unsigned int order = folio_order(folio);
 	unsigned int size = 1 << order;
 
+	VM_WARN_ON_FOLIO(folio_test_swapcache(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
 
@@ -1873,7 +1968,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	return NULL;
 put_out:
 	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
-	percpu_ref_put(&si->users);
+	if (!swap_is_vswap(si))
+		percpu_ref_put(&si->users);
 	return NULL;
 }
 
@@ -2005,6 +2101,7 @@ static bool folio_maybe_swapped(struct folio *folio)
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
 
+	/* Folio is locked and in swap cache, so ci->count > 0: cluster is alive. */
 	ci = __swap_entry_to_cluster(entry);
 	ci_off = swp_cluster_offset(entry);
 	ci_end = ci_off + folio_nr_pages(folio);
@@ -2142,9 +2239,9 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
 	pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]);
 	if (pcp_si == si && pcp_offset) {
 		ci = swap_cluster_lock(si, pcp_offset);
-		if (cluster_is_usable(ci, 0))
+		if (ci && cluster_is_usable(ci, 0))
 			offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset);
-		else
+		else if (ci)
 			swap_cluster_unlock(ci);
 	}
 	if (!offset)
@@ -2192,6 +2289,9 @@ static int __find_hibernation_swap_type(dev_t device, sector_t offset)
 
 		if (!(sis->flags & SWP_WRITEOK))
 			continue;
+		/* vswap has no bdev — never a hibernation target */
+		if (swap_is_vswap(sis))
+			continue;
 
 		if (device == sis->bdev->bd_dev) {
 			struct swap_extent *se = first_se(sis);
@@ -2379,6 +2479,9 @@ int find_first_swap(dev_t *device)
 
 		if (!(sis->flags & SWP_WRITEOK))
 			continue;
+		/* vswap has no bdev — never a hibernation target */
+		if (swap_is_vswap(sis))
+			continue;
 		*device = sis->bdev->bd_dev;
 		spin_unlock(&swap_lock);
 		return type;
@@ -2590,8 +2693,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 						&vmf);
 		}
 		if (!folio) {
+			rcu_read_lock();
 			swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
 						swp_cluster_offset(entry));
+			rcu_read_unlock();
 			if (swp_tb_get_count(swp_tb) <= 0)
 				continue;
 			return -ENOMEM;
@@ -2737,8 +2842,10 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 	 * allocations from this area (while holding swap_lock).
 	 */
 	for (i = prev + 1; i < si->max; i++) {
+		rcu_read_lock();
 		swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
 					i % SWAPFILE_CLUSTER);
+		rcu_read_unlock();
 		if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb))
 			break;
 		if ((i % LATENCY_LIMIT) == 0)
@@ -2977,6 +3084,11 @@ static int setup_swap_extents(struct swap_info_struct *sis,
 	struct inode *inode = mapping->host;
 	int ret;
 
+	if (sis->flags & SWP_VSWAP) {
+		*span = 0;
+		return 0;
+	}
+
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
@@ -3001,15 +3113,22 @@ static int setup_swap_extents(struct swap_info_struct *sis,
 
 static void _enable_swap_info(struct swap_info_struct *si)
 {
-	atomic_long_add(si->pages, &nr_swap_pages);
-	total_swap_pages += si->pages;
+	if (!swap_is_vswap(si)) {
+		atomic_long_add(si->pages, &nr_swap_pages);
+		total_swap_pages += si->pages;
+	}
 
 	assert_spin_locked(&swap_lock);
 
-	plist_add(&si->list, &swap_active_head);
-
-	/* Add back to available list */
-	add_to_avail_list(si, true);
+	/*
+	 * Vswap has no backing file and no swapoff support — keep it
+	 * off swap_active_head (used by swapoff filename lookup and
+	 * swap_sync_discard) and swap_avail_head (physical allocator).
+	 */
+	if (!swap_is_vswap(si)) {
+		plist_add(&si->list, &swap_active_head);
+		add_to_avail_list(si, true);
+	}
 }
 
 /*
@@ -3046,6 +3165,8 @@ static void wait_for_allocation(struct swap_info_struct *si)
 	struct swap_cluster_info *ci;
 
 	BUG_ON(si->flags & SWP_WRITEOK);
+	if (si->flags & SWP_VSWAP)
+		return;
 
 	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
 		ci = swap_cluster_lock(si, offset);
@@ -3184,7 +3305,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	destroy_swap_extents(p, p->swap_file);
 
-	if (!(p->flags & SWP_SOLIDSTATE))
+	if (!(p->flags & SWP_VSWAP) &&
+	    !(p->flags & SWP_SOLIDSTATE))
 		atomic_dec(&nr_rotate_swap);
 
 	mutex_lock(&swapon_mutex);
@@ -3294,6 +3416,19 @@ static void swap_stop(struct seq_file *swap, void *v)
 	mutex_unlock(&swapon_mutex);
 }
 
+static const char *swap_type_str(struct swap_info_struct *si)
+{
+	struct file *file = si->swap_file;
+
+	if (si->flags & SWP_VSWAP)
+		return "vswap\t";
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		return "partition";
+
+	return "file\t";
+}
+
 static int swap_show(struct seq_file *swap, void *v)
 {
 	struct swap_info_struct *si = v;
@@ -3313,8 +3448,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	len = seq_file_path(swap, file, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
 			len < 40 ? 40 - len : 1, " ",
-			S_ISBLK(file_inode(file)->i_mode) ?
-				"partition" : "file\t",
+			swap_type_str(si),
 			bytes, bytes < 10000000 ? "\t" : "",
 			inuse, inuse < 10000000 ? "\t" : "",
 			si->prio);
@@ -3446,7 +3580,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
 	return 0;
 }
 
-
 /*
  * Find out how many pages are allowed for a single swap device. There
  * are two limiting factors:
@@ -3552,10 +3685,43 @@ static int setup_swap_clusters_info(struct swap_info_struct *si,
 				    unsigned long maxpages)
 {
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-	struct swap_cluster_info *cluster_info;
+	struct swap_cluster_info *cluster_info = NULL;
+	struct swap_cluster_info_dynamic *ci_dyn;
 	int err = -ENOMEM;
 	unsigned long i;
 
+	/* SWP_VSWAP device: set up the xarray pool instead of a static array */
+	if (si->flags & SWP_VSWAP) {
+		/*
+		 * Pre-allocate cluster 0 and mark slot 0 (header page)
+		 * as bad so the allocator never hands out page offset 0.
+		 */
+		ci_dyn = kzalloc(sizeof(*ci_dyn), GFP_KERNEL);
+		if (!ci_dyn)
+			goto err;
+		spin_lock_init(&ci_dyn->ci.lock);
+		INIT_LIST_HEAD(&ci_dyn->ci.list);
+
+		nr_clusters = 0;
+		xa_init_flags(&si->cluster_info_pool, XA_FLAGS_ALLOC);
+		err = xa_insert(&si->cluster_info_pool, 0, ci_dyn, GFP_KERNEL);
+		if (err) {
+			kfree(ci_dyn);
+			goto err;
+		}
+
+		err = swap_cluster_setup_bad_slot(si, &ci_dyn->ci, 0, false);
+		if (err) {
+			xa_erase(&si->cluster_info_pool, 0);
+			swap_cluster_free_table(&ci_dyn->ci);
+			kfree(ci_dyn);
+			xa_destroy(&si->cluster_info_pool);
+			goto err;
+		}
+
+		goto setup_cluster_info;
+	}
+
 	cluster_info = kvzalloc_objs(*cluster_info, nr_clusters);
 	if (!cluster_info)
 		goto err;
@@ -3580,6 +3746,10 @@ static int setup_swap_clusters_info(struct swap_info_struct *si,
 	err = swap_cluster_setup_bad_slot(si, cluster_info, 0, false);
 	if (err)
 		goto err;
+
+	if (!swap_header)
+		goto setup_cluster_info;
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 
@@ -3599,6 +3769,7 @@ static int setup_swap_clusters_info(struct swap_info_struct *si,
 			goto err;
 	}
 
+setup_cluster_info:
 	INIT_LIST_HEAD(&si->free_clusters);
 	INIT_LIST_HEAD(&si->full_clusters);
 	INIT_LIST_HEAD(&si->discard_clusters);
@@ -3635,7 +3806,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	struct dentry *dentry;
 	int prio;
 	int error;
-	union swap_header *swap_header;
+	union swap_header *swap_header = NULL;
 	int nr_extents;
 	sector_t span;
 	unsigned long maxpages;
@@ -3709,7 +3880,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		goto bad_swap_unlock_inode;
 	}
 	swap_header = kmap_local_folio(folio, 0);
-
 	maxpages = read_swap_header(si, swap_header, inode);
 	if (unlikely(!maxpages)) {
 		error = -EINVAL;
@@ -3744,7 +3914,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	if (si->bdev && !bdev_rot(si->bdev)) {
 		si->flags |= SWP_SOLIDSTATE;
-	} else {
+	} else if (!(si->flags & SWP_SOLIDSTATE)) {
 		atomic_inc(&nr_rotate_swap);
 		inced_nr_rotate_swap = true;
 	}
@@ -3966,3 +4136,47 @@ static int __init swapfile_init(void)
 	return 0;
 }
 subsys_initcall(swapfile_init);
+
+#ifdef CONFIG_VSWAP
+struct swap_info_struct *vswap_si;
+
+static int __init vswap_init(void)
+{
+	struct swap_info_struct *si;
+	unsigned long maxpages;
+	int err;
+
+	si = alloc_swap_info();
+	if (IS_ERR(si))
+		return PTR_ERR(si);
+
+	maxpages = min(swapfile_maximum_size,
+		       ALIGN_DOWN((unsigned long)UINT_MAX, SWAPFILE_CLUSTER));
+	si->flags |= SWP_VSWAP | SWP_SOLIDSTATE | SWP_WRITEOK;
+	si->bdev = NULL;
+	si->max = maxpages;
+	si->pages = maxpages - 1;
+	si->prio = SHRT_MAX;
+	si->list.prio = -si->prio;
+	si->avail_list.prio = -si->prio;
+
+	err = setup_swap_clusters_info(si, NULL, maxpages);
+	if (err)
+		goto fail;
+
+	mutex_lock(&swapon_mutex);
+	enable_swap_info(si);
+	mutex_unlock(&swapon_mutex);
+
+	vswap_si = si;
+	pr_info("vswap: created virtual swap device (%lu pages)\n", maxpages);
+	return 0;
+
+fail:
+	spin_lock(&swap_lock);
+	si->flags = 0;
+	spin_unlock(&swap_lock);
+	return err;
+}
+late_initcall(vswap_init);
+#endif
diff --git a/mm/vswap.h b/mm/vswap.h
new file mode 100644
index 000000000000..094ff16cb5a4
--- /dev/null
+++ b/mm/vswap.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Virtual swap space
+ *
+ * Copyright (C) 2026 Nhat Pham
+ */
+#ifndef _MM_VSWAP_H
+#define _MM_VSWAP_H
+
+#include <linux/swap.h>
+
+#ifdef CONFIG_VSWAP
+
+extern struct swap_info_struct *vswap_si;
+
+static inline bool swap_is_vswap(struct swap_info_struct *si)
+{
+	return si->flags & SWP_VSWAP;
+}
+
+#else
+
+static inline bool swap_is_vswap(struct swap_info_struct *si)
+{
+	return false;
+}
+
+#endif /* CONFIG_VSWAP */
+#endif /* _MM_VSWAP_H */
diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..993406074d58 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -994,11 +994,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct swap_info_struct *si;
 	int ret = 0;
 
-	/* try to allocate swap cache folio */
 	si = get_swap_device(swpentry);
 	if (!si)
 		return -EEXIST;
 
+	if (si->flags & SWP_VSWAP) {
+		put_swap_device(si);
+		return -EINVAL;
+	}
+
+	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
 	folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
 				       NO_INTERLEAVE_INDEX);
@@ -1049,7 +1054,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	folio_set_reclaim(folio);
 
 	/* start writeback */
-	__swap_writepage(folio, NULL);
+	ret = __swap_writepage(folio, NULL);
+	WARN_ON_ONCE(ret);
 
 out:
 	if (ret) {
-- 
2.53.0-Meta


^ permalink raw reply related

* [RFC PATCH 2/5] mm, swap: support zswap and zeroswap as vswap backends
From: Nhat Pham @ 2026-05-28 21:29 UTC (permalink / raw)
  To: kasong
  Cc: Liam.Howlett, akpm, apopple, axelrasmussen, baohua, baolin.wang,
	bhe, byungchul, cgroups, chengming.zhou, chrisl, corbet, david,
	dev.jain, gourry, hannes, hughd, jannh, joshua.hahnjy, lance.yang,
	lenb, linux-doc, linux-kernel, linux-mm, linux-pm,
	lorenzo.stoakes, matthew.brost, mhocko, muchun.song, npache,
	nphamcs, pavel, peterx, peterz, pfalcato, rafael, rakie.kim,
	roman.gushchin, rppt, ryan.roberts, shakeel.butt, shikemeng,
	surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed, yuanchu,
	zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>

Build the virtual swap layer on top of the swap-table infrastructure.
Virtual swap entries decouple PTE swap entries from physical backing,
allowing pages to be compressed by zswap (or detected as zero-filled)
without pre-allocating a physical swap slot.

This patch only supports the zswap and zero-filled-page (zeroswap)
backends. If zswap_store() fails, the folio is re-dirtied and stays in
the swap cache (swap_writeout() returns AOP_WRITEPAGE_ACTIVATE); the
fallback to physical disk backing comes in the next patch. Zswap
writeback of vswap-backed entries is also disabled: the shrinker skips
writeback when no physical swap pages are available.

Suggested-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/zswap.h |   3 +
 mm/internal.h         |  20 ++-
 mm/madvise.c          |   2 +-
 mm/memcontrol.c       |   8 +-
 mm/memory.c           |  20 ++-
 mm/page_io.c          |  61 +++++--
 mm/swap.h             |   4 +-
 mm/swap_state.c       |   8 +
 mm/swap_table.h       |  53 ++++++
 mm/swapfile.c         | 375 +++++++++++++++++++++++++++++++++---------
 mm/vmscan.c           |   5 +-
 mm/vswap.h            | 292 +++++++++++++++++++++++++++++++-
 mm/zswap.c            | 106 +++++++-----
 13 files changed, 807 insertions(+), 150 deletions(-)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 30c193a1207e..4b4f211f3301 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -6,6 +6,7 @@
 #include <linux/mm_types.h>
 
 struct lruvec;
+struct zswap_entry;
 
 extern atomic_long_t zswap_stored_pages;
 
@@ -28,6 +29,7 @@ unsigned long zswap_total_pages(void);
 bool zswap_store(struct folio *folio);
 int zswap_load(struct folio *folio);
 void zswap_invalidate(swp_entry_t swp);
+void zswap_entry_free(struct zswap_entry *entry);
 int zswap_swapon(int type, unsigned long nr_pages);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
@@ -50,6 +52,7 @@ static inline int zswap_load(struct folio *folio)
 }
 
 static inline void zswap_invalidate(swp_entry_t swp) {}
+static inline void zswap_entry_free(struct zswap_entry *entry) {}
 static inline int zswap_swapon(int type, unsigned long nr_pages)
 {
 	return 0;
diff --git a/mm/internal.h b/mm/internal.h
index 7646ecb9d621..23ea4c8172df 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,6 +16,7 @@
 #include <linux/pagewalk.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include "vswap.h"
 #include <linux/leafops.h>
 #include <linux/tracepoint-defs.h>
 
@@ -436,6 +437,9 @@ static inline pte_t pte_next_swp_offset(pte_t pte)
  * @start_ptep: Page table pointer for the first entry.
  * @max_nr: The maximum number of table entries to consider.
  * @pte: Page table entry for the first entry.
+ * @free_batch: True when the batch is for a free path. Skips the
+ *              vswap uniform-backing check (which is only relevant
+ *              for swapin batches).
  *
  * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
  * containing swap entries all with consecutive offsets and targeting the same
@@ -446,11 +450,14 @@ static inline pte_t pte_next_swp_offset(pte_t pte)
  *
  * Return: the number of table entries in the batch.
  */
-static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte,
+				 bool free_batch)
 {
 	pte_t expected_pte = pte_next_swp_offset(pte);
 	const pte_t *end_ptep = start_ptep + max_nr;
 	pte_t *ptep = start_ptep + 1;
+	swp_entry_t entry __maybe_unused;
+	int nr;
 
 	VM_WARN_ON(max_nr < 1);
 	VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte)));
@@ -464,7 +471,16 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
 		ptep++;
 	}
 
-	return ptep - start_ptep;
+	nr = ptep - start_ptep;
+#ifdef CONFIG_VSWAP
+	if (!free_batch) {
+		entry = softleaf_from_pte(ptep_get(start_ptep));
+		if (nr > 1 && swap_is_vswap(__swap_entry_to_info(entry)) &&
+		    !vswap_can_swapin_thp(entry, nr))
+			return 1;
+	}
+#endif
+	return nr;
 }
 #endif /* CONFIG_MMU */
 
diff --git a/mm/madvise.c b/mm/madvise.c
index cd9bb077072c..75ec10fbd61a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -692,7 +692,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 			if (softleaf_is_swap(entry)) {
 				max_nr = (end - addr) / PAGE_SIZE;
-				nr = swap_pte_batch(pte, max_nr, ptent);
+				nr = swap_pte_batch(pte, max_nr, ptent, true);
 				nr_swap -= nr;
 				swap_put_entries_direct(entry, nr);
 				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 039e9bc8971c..a3ad83c229f7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -48,6 +48,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/zswap.h>
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
@@ -5538,8 +5539,13 @@ void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
 
 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 {
-	long nr_swap_pages = get_nr_swap_pages();
+	long nr_swap_pages;
 
+	/* vswap provides unbounded virtual swap when zswap is enabled */
+	if (IS_ENABLED(CONFIG_VSWAP) && zswap_is_enabled())
+		return PAGE_COUNTER_MAX;
+
+	nr_swap_pages = get_nr_swap_pages();
 	if (mem_cgroup_disabled() || do_memsw_account())
 		return nr_swap_pages;
 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
diff --git a/mm/memory.c b/mm/memory.c
index 7c020995eafc..c3050e49b086 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1764,7 +1764,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		if (!should_zap_cows(details))
 			return 1;
 
-		nr = swap_pte_batch(pte, max_nr, ptent);
+		nr = swap_pte_batch(pte, max_nr, ptent, true);
 		rss[MM_SWAPENTS] -= nr;
 		swap_put_entries_direct(entry, nr);
 	} else if (softleaf_is_migration(entry)) {
@@ -4630,7 +4630,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	 * from different backends. And they are likely corner cases. Similar
 	 * things might be added once zswap support large folios.
 	 */
-	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
+	if (swap_pte_batch(ptep, nr_pages, pte, false) != nr_pages)
 		return false;
 	return true;
 }
@@ -4675,15 +4675,19 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		return 0;
 
+	entry = softleaf_from_pte(vmf->orig_pte);
+
 	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
+	 * A large swapped out folio could be partially or fully in zswap.
+	 * With vswap, vswap_can_swapin_thp() (via swap_pte_batch) lets
+	 * THP swapin through only for backings that don't need per-page
+	 * decompression. For non-vswap entries we still need the
+	 * zswap_never_enabled() bail — zswap_load rejects large folios
+	 * with -EINVAL, which would SIGBUS the fault.
 	 */
-	if (!zswap_never_enabled())
+	if (!swap_is_vswap(__swap_entry_to_info(entry)) && !zswap_never_enabled())
 		return 0;
 
-	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
 	 * and suitable for swapping THP.
@@ -4942,7 +4946,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		folio_ptep = vmf->pte - idx;
 		folio_pte = ptep_get(folio_ptep);
 		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
-		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
+		    swap_pte_batch(folio_ptep, nr, folio_pte, false) != nr)
 			goto check_folio;
 
 		page_idx = idx;
diff --git a/mm/page_io.c b/mm/page_io.c
index 8126be6e4cfb..b3c7e56c8eed 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -27,6 +27,7 @@
 #include <linux/zswap.h>
 #include "swap.h"
 #include "swap_table.h"
+#include "vswap.h"
 
 static void __end_swap_bio_write(struct bio *bio)
 {
@@ -204,19 +205,28 @@ static bool is_folio_zero_filled(struct folio *folio)
 
 static void swap_zeromap_folio_set(struct folio *folio)
 {
+	struct swap_info_struct *si = __swap_entry_to_info(folio->swap);
 	struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
 	int nr_pages = folio_nr_pages(folio);
 	struct swap_cluster_info *ci;
+	unsigned int voff, i;
 	swp_entry_t entry;
-	unsigned int i;
 
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
 
 	ci = swap_cluster_get_and_lock(folio);
-	for (i = 0; i < folio_nr_pages(folio); i++) {
-		entry = page_swap_entry(folio_page(folio, i));
-		__swap_table_set_zero(ci, swp_cluster_offset(entry));
+	if (swap_is_vswap(si)) {
+		voff = swp_cluster_offset(folio->swap);
+		/* Free any prior backing (e.g. ZSWAP entry from earlier swapout) */
+		vswap_release_backing(ci, voff, nr_pages);
+		for (i = 0; i < nr_pages; i++)
+			vswap_set_zero(ci, voff + i);
+	} else {
+		for (i = 0; i < nr_pages; i++) {
+			entry = page_swap_entry(folio_page(folio, i));
+			__swap_table_set_zero(ci, swp_cluster_offset(entry));
+		}
 	}
 	swap_cluster_unlock(ci);
 
@@ -282,6 +292,9 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	 */
 	swap_zeromap_folio_clear(folio);
 
+	if (swap_is_vswap(__swap_entry_to_info(folio->swap)))
+		vswap_prepare_writeout(folio->swap, folio);
+
 	if (zswap_store(folio)) {
 		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
 		goto out_unlock;
@@ -295,6 +308,11 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	}
 	rcu_read_unlock();
 
+	if (swap_is_vswap(__swap_entry_to_info(folio->swap))) {
+		folio_mark_dirty(folio);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
 	return __swap_writepage(folio, swap_plug);
 out_unlock:
 	folio_unlock(folio);
@@ -537,23 +555,40 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 static int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 			      bool *is_zerop)
 {
-	int i;
-	bool is_zero;
-	unsigned int ci_start = swp_cluster_offset(entry);
+	struct swap_info_struct *si = __swap_entry_to_info(entry);
 	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+	unsigned int ci_start = swp_cluster_offset(entry), ci_off, ci_end;
+	bool is_zero;
 
 	VM_WARN_ON_ONCE(ci_start + max_nr > SWAPFILE_CLUSTER);
 
+	ci_off = ci_start;
+	ci_end = ci_off + max_nr;
+
+	if (swap_is_vswap(si)) {
+		spin_lock(&ci->lock);
+		is_zero = vswap_test_zero(ci, ci_off);
+		if (is_zerop)
+			*is_zerop = is_zero;
+		while (++ci_off < ci_end) {
+			if (is_zero != vswap_test_zero(ci, ci_off))
+				break;
+		}
+		spin_unlock(&ci->lock);
+		return ci_off - ci_start;
+	}
+
 	rcu_read_lock();
-	is_zero = __swap_table_test_zero(ci, ci_start);
-	for (i = 1; i < max_nr; i++)
-		if (is_zero != __swap_table_test_zero(ci, ci_start + i))
-			break;
-	rcu_read_unlock();
+	is_zero = __swap_table_test_zero(ci, ci_off);
 	if (is_zerop)
 		*is_zerop = is_zero;
+	while (++ci_off < ci_end) {
+		if (is_zero != __swap_table_test_zero(ci, ci_off))
+			break;
+	}
+	rcu_read_unlock();
 
-	return i;
+	return ci_off - ci_start;
 }
 
 static bool swap_read_folio_zeromap(struct folio *folio)
diff --git a/mm/swap.h b/mm/swap.h
index 479ee5871cb9..640413e30880 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -69,7 +69,9 @@ struct swap_cluster_info_dynamic {
 	struct swap_cluster_info ci;	/* Underlying cluster info */
 	unsigned int index;		/* for cluster_index() */
 	struct rcu_head rcu;		/* For kfree_rcu deferred free */
-	/* Backend pointers (virtual_table) added in a later patch. */
+#ifdef CONFIG_VSWAP
+	atomic_long_t *virtual_table;	/* Backing pointers for vswap slots */
+#endif
 };
 
 /* All on-list cluster must have a non-zero flag. */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b063c47138c5..6bfa185b7d0f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "swap_table.h"
 #include "swap.h"
+#include "vswap.h"
 
 /*
  * swapper_space is a fiction, retained to simplify the path through
@@ -692,6 +693,13 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
 	if (IS_ERR(folio))
 		return folio;
 
+	if (folio_test_large(folio) && swap_is_vswap(__swap_entry_to_info(folio->swap)) &&
+	    !vswap_can_swapin_thp(folio->swap, folio_nr_pages(folio))) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return NULL;
+	}
+
 	swap_read_folio(folio, NULL);
 	return folio;
 }
diff --git a/mm/swap_table.h b/mm/swap_table.h
index fd7f0fb9836a..b0e7ef9c966b 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -6,6 +6,8 @@
 #include <linux/atomic.h>
 #include "swap.h"
 
+struct zswap_entry;
+
 /* A typical flat array in each cluster as swap table */
 struct swap_table {
 	atomic_long_t entries[SWAPFILE_CLUSTER];
@@ -368,4 +370,55 @@ static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
 }
 #endif
 
+/*
+ * Pointer-tagged swap table entry: rmap for vswap-backing physical slots.
+ *
+ * On physical clusters, a Pointer-tagged entry stores the vswap entry
+ * that owns this physical slot (the reverse map). The top bit is reserved
+ * as a cache-only flag, set when vswap swap_count drops to 0 but the
+ * folio is still in swap cache.
+ *
+ *   Pointer:  |C|--- vswap entry ---|100|
+ *             C = SWP_RMAP_CACHE_ONLY (bit 63)
+ */
+#ifdef CONFIG_VSWAP
+#define SWP_TB_PTR_MARK		0b100UL
+#define SWP_TB_PTR_MARK_MASK	0b111UL
+#define SWP_RMAP_CACHE_ONLY	(1UL << (BITS_PER_LONG - 1))
+#define SWP_RMAP_ENTRY_MASK	(~(SWP_RMAP_CACHE_ONLY | SWP_TB_PTR_MARK_MASK))
+
+static inline bool swp_tb_is_pointer(unsigned long swp_tb)
+{
+	return (swp_tb & SWP_TB_PTR_MARK_MASK) == SWP_TB_PTR_MARK;
+}
+
+static inline unsigned long swp_entry_to_swp_tb_ptr(swp_entry_t entry)
+{
+	return (entry.val << 3) | SWP_TB_PTR_MARK;
+}
+
+static inline swp_entry_t swp_tb_ptr_to_swp_entry(unsigned long swp_tb)
+{
+	swp_entry_t entry;
+
+	VM_WARN_ON(!swp_tb_is_pointer(swp_tb));
+	entry.val = (swp_tb & SWP_RMAP_ENTRY_MASK) >> 3;
+	return entry;
+}
+#else
+static inline bool swp_tb_is_pointer(unsigned long swp_tb)
+{
+	return false;
+}
+static inline unsigned long swp_entry_to_swp_tb_ptr(swp_entry_t entry)
+{
+	return 0;
+}
+static inline swp_entry_t swp_tb_ptr_to_swp_entry(unsigned long swp_tb)
+{
+	return (swp_entry_t){};
+}
+
+#endif /* CONFIG_VSWAP */
+
 #endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f6d2529159ff..c90d83fd628a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -131,6 +131,26 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
 	.lock = INIT_LOCAL_LOCK(),
 };
 
+#ifdef CONFIG_VSWAP
+struct percpu_vswap_cluster {
+	unsigned long offset[SWAP_NR_ORDERS];
+	local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_vswap_cluster, percpu_vswap_cluster) = {
+	.offset = { [0 ... SWAP_NR_ORDERS - 1] = SWAP_ENTRY_INVALID },
+	.lock = INIT_LOCAL_LOCK(),
+};
+
+static bool vswap_alloc(struct folio *folio);
+static void vswap_free_cluster(struct swap_info_struct *si,
+			       struct swap_cluster_info *ci);
+#else
+static inline bool vswap_alloc(struct folio *folio) { return false; }
+static inline void vswap_free_cluster(struct swap_info_struct *si,
+				      struct swap_cluster_info *ci) {}
+#endif
+
 /* May return NULL on invalid type, caller must check for NULL return */
 static struct swap_info_struct *swap_type_to_info(int type)
 {
@@ -538,8 +558,14 @@ swap_cluster_populate(struct swap_info_struct *si,
 	 * Only cluster isolation from the allocator does table allocation.
 	 * Swap allocator uses percpu clusters and holds the local lock.
 	 */
-	lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
-	if (!(si->flags & SWP_SOLIDSTATE))
+#ifdef CONFIG_VSWAP
+	if (swap_is_vswap(si))
+		lockdep_assert_held(&this_cpu_ptr(&percpu_vswap_cluster)->lock);
+	else
+#endif
+	if (si->flags & SWP_SOLIDSTATE)
+		lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+	else
 		lockdep_assert_held(&si->global_cluster_lock);
 	lockdep_assert_held(&ci->lock);
 
@@ -555,7 +581,12 @@ swap_cluster_populate(struct swap_info_struct *si,
 	spin_unlock(&ci->lock);
 	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster_lock);
-	local_unlock(&percpu_swap_cluster.lock);
+#ifdef CONFIG_VSWAP
+	if (swap_is_vswap(si))
+		local_unlock(&percpu_vswap_cluster.lock);
+	else
+#endif
+		local_unlock(&percpu_swap_cluster.lock);
 
 	ret = swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC |
 					   GFP_KERNEL);
@@ -568,7 +599,12 @@ swap_cluster_populate(struct swap_info_struct *si,
 	 * could happen with ignoring the percpu cluster is fragmentation,
 	 * which is acceptable since this fallback and race is rare.
 	 */
-	local_lock(&percpu_swap_cluster.lock);
+#ifdef CONFIG_VSWAP
+	if (swap_is_vswap(si))
+		local_lock(&percpu_vswap_cluster.lock);
+	else
+#endif
+		local_lock(&percpu_swap_cluster.lock);
 	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_lock(&si->global_cluster_lock);
 	spin_lock(&ci->lock);
@@ -738,19 +774,12 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
 		return;
 	}
 
+	/*
+	 * Vswap dynamic clusters need explicit cleanup (xarray erase,
+	 * kfree_rcu, virtual_table free if allocated).
+	 */
 	if (si->flags & SWP_VSWAP) {
-		struct swap_cluster_info_dynamic *ci_dyn;
-
-		ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
-		if (ci->flags != CLUSTER_FLAG_NONE) {
-			spin_lock(&si->lock);
-			list_del(&ci->list);
-			spin_unlock(&si->lock);
-		}
-		swap_cluster_free_table(ci);
-		xa_erase(&si->cluster_info_pool, ci_dyn->index);
-		ci->flags = CLUSTER_FLAG_DEAD;
-		kfree_rcu(ci_dyn, rcu);
+		vswap_free_cluster(si, ci);
 		return;
 	}
 
@@ -874,6 +903,8 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 	spin_unlock(&ci->lock);
 	do {
 		swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+		if (swp_tb_is_pointer(swp_tb))
+			break;
 		if (swp_tb_get_count(swp_tb))
 			break;
 		if (swp_tb_is_folio(swp_tb))
@@ -946,47 +977,29 @@ static bool cluster_scan_range(struct swap_info_struct *si,
 
 static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
 					 struct swap_cluster_info *ci,
+					 unsigned int ci_off,
+					 unsigned long swp_tb,
 					 struct folio *folio,
-					 unsigned int ci_off)
+					 unsigned int order)
 {
-	unsigned int order;
-	unsigned long nr_pages;
+	unsigned long nr_pages = 1 << order;
 
 	lockdep_assert_held(&ci->lock);
 
 	if (!(si->flags & SWP_WRITEOK))
 		return false;
 
-	/*
-	 * All mm swap allocation starts with a folio (folio_alloc_swap),
-	 * it's also the only allocation path for large orders allocation.
-	 * Such swap slots starts with count == 0 and will be increased
-	 * upon folio unmap.
-	 *
-	 * Else, it's a exclusive order 0 allocation for hibernation.
-	 * The slot starts with count == 1 and never increases.
-	 */
-	if (likely(folio)) {
-		order = folio_order(folio);
-		nr_pages = 1 << order;
-		swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
+	swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
+
+	if (swp_tb_is_folio(swp_tb))
 		__swap_cache_add_folio(ci, folio, swp_entry(si->type,
 							    ci_off + cluster_offset(si, ci)));
-	} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
-		order = 0;
-		nr_pages = 1;
-		swap_cluster_assert_empty(ci, ci_off, 1, false);
-		/* Fake shadow placeholder with no flag, hibernation does not use the zeromap */
-		__swap_table_set(ci, ci_off, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0), 1));
-	} else {
-		/* Allocation without folio is only possible with hibernation */
-		WARN_ON_ONCE(1);
-		return false;
-	}
+	else
+		__swap_table_set(ci, ci_off, swp_tb);
 
 	/*
 	 * The first allocation in a cluster makes the
-	 * cluster exclusive to this order
+	 * cluster exclusive to this order.
 	 */
 	if (cluster_is_empty(ci))
 		ci->order = order;
@@ -999,11 +1012,13 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
 /* Try use a new cluster for current CPU and allocate from it. */
 static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 					    struct swap_cluster_info *ci,
-					    struct folio *folio, unsigned long offset)
+					    struct folio *folio,
+					    unsigned long offset,
+					    unsigned long swp_tb)
 {
 	unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
 	unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
-	unsigned int order = likely(folio) ? folio_order(folio) : 0;
+	unsigned int order = folio ? folio_order(folio) : 0;
 	unsigned long end = start + SWAPFILE_CLUSTER;
 	unsigned int nr_pages = 1 << order;
 	bool need_reclaim, ret, usable;
@@ -1029,7 +1044,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 			if (!ret)
 				continue;
 		}
-		if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER))
+		if (!__swap_cluster_alloc_entries(si, ci, offset % SWAPFILE_CLUSTER,
+					swp_tb, folio, order))
 			break;
 		found = offset;
 		offset += nr_pages;
@@ -1042,6 +1058,11 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 		relocate_cluster(si, ci);
 		swap_cluster_unlock(ci);
 	}
+#ifdef CONFIG_VSWAP
+	if (swap_is_vswap(si)) {
+		this_cpu_write(percpu_vswap_cluster.offset[order], next);
+	} else
+#endif
 	if (si->flags & SWP_SOLIDSTATE) {
 		this_cpu_write(percpu_swap_cluster.offset[order], next);
 		this_cpu_write(percpu_swap_cluster.si[order], si);
@@ -1054,7 +1075,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
 					 struct list_head *list,
 					 struct folio *folio,
-					 bool scan_all)
+					 bool scan_all,
+					 unsigned long swp_tb)
 {
 	unsigned int found = SWAP_ENTRY_INVALID;
 
@@ -1065,7 +1087,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
 		if (!ci)
 			break;
 		offset = cluster_offset(si, ci);
-		found = alloc_swap_scan_cluster(si, ci, folio, offset);
+		found = alloc_swap_scan_cluster(si, ci, folio, offset, swp_tb);
 		if (found)
 			break;
 	} while (scan_all);
@@ -1074,7 +1096,8 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
 }
 
 static unsigned int alloc_swap_scan_dynamic(struct swap_info_struct *si,
-					    struct folio *folio)
+					    struct folio *folio,
+					    unsigned long swp_tb)
 {
 	struct swap_cluster_info_dynamic *ci_dyn;
 	struct swap_cluster_info *ci;
@@ -1094,10 +1117,17 @@ static unsigned int alloc_swap_scan_dynamic(struct swap_info_struct *si,
 		return SWAP_ENTRY_INVALID;
 	}
 
+	if (vswap_cluster_alloc_vtable(ci_dyn)) {
+		swap_cluster_free_table(&ci_dyn->ci);
+		kfree(ci_dyn);
+		return SWAP_ENTRY_INVALID;
+	}
+
 	if (xa_alloc(&si->cluster_info_pool, &ci_dyn->index, ci_dyn,
 		     XA_LIMIT(1, DIV_ROUND_UP(si->max, SWAPFILE_CLUSTER) - 1),
 		     GFP_ATOMIC)) {
 		swap_cluster_free_table(&ci_dyn->ci);
+		vswap_cluster_free_vtable(&ci_dyn->ci);
 		kfree(ci_dyn);
 		return SWAP_ENTRY_INVALID;
 	}
@@ -1105,7 +1135,7 @@ static unsigned int alloc_swap_scan_dynamic(struct swap_info_struct *si,
 	ci = &ci_dyn->ci;
 	spin_lock(&ci->lock);
 	offset = cluster_offset(si, ci);
-	return alloc_swap_scan_cluster(si, ci, folio, offset);
+	return alloc_swap_scan_cluster(si, ci, folio, offset, swp_tb);
 }
 
 static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
@@ -1166,18 +1196,20 @@ static void swap_reclaim_work(struct work_struct *work)
  * Try to allocate swap entries with specified order and try set a new
  * cluster for current CPU too.
  */
-static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
-					      struct folio *folio)
+static unsigned long cluster_alloc_swap_entry_tb(struct swap_info_struct *si,
+						 struct folio *folio,
+						 unsigned long swp_tb)
 {
+	unsigned int order = folio ? folio_order(folio) : 0;
 	struct swap_cluster_info *ci;
-	unsigned int order = likely(folio) ? folio_order(folio) : 0;
 	unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
 
 	/*
-	 * Swapfile is not block device so unable
-	 * to allocate large entries.
+	 * File-based swap can't do large contiguous IO. vswap has no IO
+	 * here (large entries are fine; THP swapin uses vswap_can_swapin_thp
+	 * to gate based on backing).
 	 */
-	if (order && !(si->flags & SWP_BLKDEV))
+	if (order && !(si->flags & SWP_BLKDEV) && !swap_is_vswap(si))
 		return 0;
 
 	if (!(si->flags & SWP_SOLIDSTATE)) {
@@ -1192,7 +1224,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 		if (cluster_is_usable(ci, order)) {
 			if (cluster_is_empty(ci))
 				offset = cluster_offset(si, ci);
-			found = alloc_swap_scan_cluster(si, ci, folio, offset);
+			found = alloc_swap_scan_cluster(si, ci, folio, offset, swp_tb);
 		} else {
 			swap_cluster_unlock(ci);
 		}
@@ -1206,25 +1238,25 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 	 * to spread out the writes.
 	 */
 	if (si->flags & SWP_PAGE_DISCARD) {
-		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
+		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false, swp_tb);
 		if (found)
 			goto done;
 	}
 
 	if (order < PMD_ORDER) {
-		found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
+		found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true, swp_tb);
 		if (found)
 			goto done;
 	}
 
 	if (si->flags & SWP_VSWAP) {
-		found = alloc_swap_scan_dynamic(si, folio);
+		found = alloc_swap_scan_dynamic(si, folio, swp_tb);
 		if (found)
 			goto done;
 	}
 
 	if (!(si->flags & SWP_PAGE_DISCARD)) {
-		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
+		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false, swp_tb);
 		if (found)
 			goto done;
 	}
@@ -1240,7 +1272,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 		 * failure is not critical. Scanning one cluster still
 		 * keeps the list rotated and reclaimed (for clean swap cache).
 		 */
-		found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
+		found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false, swp_tb);
 		if (found)
 			goto done;
 	}
@@ -1254,11 +1286,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 		 * Clusters here have at least one usable slots and can't fail order 0
 		 * allocation, but reclaim may drop si->lock and race with another user.
 		 */
-		found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
+		found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true, swp_tb);
 		if (found)
 			goto done;
 
-		found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
+		found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true, swp_tb);
 		if (found)
 			goto done;
 	}
@@ -1394,7 +1426,8 @@ static void swap_range_alloc(struct swap_info_struct *si,
 		if (vm_swap_full())
 			schedule_work(&si->reclaim_work);
 	}
-	atomic_long_sub(nr_entries, &nr_swap_pages);
+	if (!swap_is_vswap(si))
+		atomic_long_sub(nr_entries, &nr_swap_pages);
 }
 
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -1404,8 +1437,10 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
 	unsigned int i;
 
-	for (i = 0; i < nr_entries; i++)
-		zswap_invalidate(swp_entry(si->type, offset + i));
+	if (!swap_is_vswap(si)) {
+		for (i = 0; i < nr_entries; i++)
+			zswap_invalidate(swp_entry(si->type, offset + i));
+	}
 
 	if (si->flags & SWP_BLKDEV)
 		swap_slot_free_notify =
@@ -1424,7 +1459,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	 * only after the above cleanups are done.
 	 */
 	smp_wmb();
-	atomic_long_add(nr_entries, &nr_swap_pages);
+	if (!swap_is_vswap(si))
+		atomic_long_add(nr_entries, &nr_swap_pages);
 	swap_usage_sub(si, nr_entries);
 }
 
@@ -1452,12 +1488,15 @@ static bool get_swap_device_info(struct swap_info_struct *si)
  * Fast path try to get swap entries with specified order from current
  * CPU's swap entry pool (a cluster).
  */
-static bool swap_alloc_fast(struct folio *folio)
+static swp_entry_t swap_alloc_fast(struct folio *folio)
 {
 	unsigned int order = folio_order(folio);
 	struct swap_cluster_info *ci;
 	struct swap_info_struct *si;
-	unsigned int offset;
+	unsigned long offset, swp_tb;
+	unsigned long found = 0;
+
+	lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
 
 	/*
 	 * Once allocated, swap_info_struct will never be completely freed,
@@ -1466,25 +1505,32 @@ static bool swap_alloc_fast(struct folio *folio)
 	si = this_cpu_read(percpu_swap_cluster.si[order]);
 	offset = this_cpu_read(percpu_swap_cluster.offset[order]);
 	if (!si || !offset || !get_swap_device_info(si))
-		return false;
+		return (swp_entry_t){};
+
+	swp_tb = folio_to_swp_tb(folio, 0);
 
 	ci = swap_cluster_lock(si, offset);
 	if (ci && cluster_is_usable(ci, order)) {
 		if (cluster_is_empty(ci))
 			offset = cluster_offset(si, ci);
-		alloc_swap_scan_cluster(si, ci, folio, offset);
+		found = alloc_swap_scan_cluster(si, ci, folio, offset, swp_tb);
 	} else if (ci) {
 		swap_cluster_unlock(ci);
 	}
 
 	put_swap_device(si);
-	return folio_test_swapcache(folio);
+	if (found)
+		return swp_entry(si->type, found);
+	return (swp_entry_t){};
 }
 
 /* Rotate the device and switch to a new cluster */
-static void swap_alloc_slow(struct folio *folio)
+static swp_entry_t swap_alloc_slow(struct folio *folio)
 {
 	struct swap_info_struct *si, *next;
+	unsigned long swp_tb, found;
+
+	swp_tb = folio_to_swp_tb(folio, 0);
 
 	spin_lock(&swap_avail_lock);
 start_over:
@@ -1493,12 +1539,13 @@ static void swap_alloc_slow(struct folio *folio)
 		plist_requeue(&si->avail_list, &swap_avail_head);
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
-			cluster_alloc_swap_entry(si, folio);
+			found = cluster_alloc_swap_entry_tb(si, folio,
+							    swp_tb);
 			put_swap_device(si);
-			if (folio_test_swapcache(folio))
-				return;
+			if (found)
+				return swp_entry(si->type, found);
 			if (folio_test_large(folio))
-				return;
+				return (swp_entry_t){};
 		}
 
 		spin_lock(&swap_avail_lock);
@@ -1516,6 +1563,7 @@ static void swap_alloc_slow(struct folio *folio)
 			goto start_over;
 	}
 	spin_unlock(&swap_avail_lock);
+	return (swp_entry_t){};
 }
 
 /*
@@ -1695,6 +1743,15 @@ static void swap_put_entries_cluster(struct swap_info_struct *si,
 	if (!need_reclaim || !reclaim_cache)
 		return;
 
+	/*
+	 * Vswap space is dynamically allocated and effectively infinite —
+	 * there is no benefit to reclaiming swap cache entries to free
+	 * virtual slots. Physical slot reclaim is handled separately via
+	 * SWP_RMAP_CACHE_ONLY on the physical cluster.
+	 */
+	if (swap_is_vswap(si))
+		return;
+
 	do {
 		nr_reclaimed = __try_to_reclaim_swap(si, offset,
 						     TTRS_UNMAPPED | TTRS_FULL);
@@ -1800,6 +1857,44 @@ static int swap_dup_entries_cluster(struct swap_info_struct *si,
  * Context: Caller needs to hold the folio lock.
  * Return: Whether the folio was added to the swap cache.
  */
+#ifdef CONFIG_VSWAP
+static bool vswap_alloc(struct folio *folio)
+{
+	unsigned int order = folio_order(folio);
+	struct swap_cluster_info *ci;
+	unsigned long offset;
+
+	local_lock(&percpu_vswap_cluster.lock);
+	offset = this_cpu_read(percpu_vswap_cluster.offset[order]);
+
+	if (offset != SWAP_ENTRY_INVALID) {
+		ci = swap_cluster_lock(vswap_si, offset);
+		if (ci && cluster_is_usable(ci, order)) {
+			if (cluster_is_empty(ci))
+				offset = cluster_offset(vswap_si, ci);
+			alloc_swap_scan_cluster(vswap_si, ci, folio,
+					       offset, folio_to_swp_tb(folio, 0));
+		} else if (ci) {
+			swap_cluster_unlock(ci);
+		}
+	}
+
+	if (!folio_test_swapcache(folio))
+		cluster_alloc_swap_entry_tb(vswap_si, folio,
+					    folio_to_swp_tb(folio, 0));
+
+	if (folio_test_swapcache(folio)) {
+		/* alloc_swap_scan_cluster updated percpu offset already */
+		local_unlock(&percpu_vswap_cluster.lock);
+		return true;
+	}
+
+	this_cpu_write(percpu_vswap_cluster.offset[order], SWAP_ENTRY_INVALID);
+	local_unlock(&percpu_vswap_cluster.lock);
+	return false;
+}
+#endif
+
 int folio_alloc_swap(struct folio *folio)
 {
 	unsigned int order = folio_order(folio);
@@ -1827,12 +1922,21 @@ int folio_alloc_swap(struct folio *folio)
 		}
 	}
 
+	/*
+	 * Skip vswap when zswap is disabled — without zswap, vswap entries
+	 * have nowhere to go on writeout (no physical fallback yet; that
+	 * arrives in the next patch).
+	 */
+	if (zswap_is_enabled() && vswap_alloc(folio))
+		goto done;
+
 again:
 	local_lock(&percpu_swap_cluster.lock);
-	if (!swap_alloc_fast(folio))
+	if (!swap_alloc_fast(folio).val)
 		swap_alloc_slow(folio);
 	local_unlock(&percpu_swap_cluster.lock);
 
+done:
 	if (!order && unlikely(!folio_test_swapcache(folio))) {
 		if (swap_sync_discard())
 			goto again;
@@ -1848,6 +1952,106 @@ int folio_alloc_swap(struct folio *folio)
 	return 0;
 }
 
+#ifdef CONFIG_VSWAP
+static void vswap_free_cluster(struct swap_info_struct *si,
+			       struct swap_cluster_info *ci)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	if (ci->flags != CLUSTER_FLAG_NONE) {
+		spin_lock(&si->lock);
+		list_del(&ci->list);
+		spin_unlock(&si->lock);
+	}
+	swap_cluster_free_table(ci);
+	vswap_cluster_free_vtable(ci);
+	xa_erase(&si->cluster_info_pool, ci_dyn->index);
+	ci->flags = CLUSTER_FLAG_DEAD;
+	kfree_rcu(ci_dyn, rcu);
+}
+
+void vswap_release_backing(struct swap_cluster_info *ci,
+			   unsigned int ci_start, unsigned int nr)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	unsigned int ci_off;
+	unsigned long vt;
+
+	lockdep_assert_held(&ci->lock);
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+
+	for (ci_off = ci_start; ci_off < ci_start + nr; ci_off++) {
+		vt = __vtable_get(ci_dyn, ci_off);
+
+		switch (vtable_type(vt)) {
+		case VSWAP_ZSWAP:
+			if (vtable_to_zswap(vt))
+				zswap_entry_free(vtable_to_zswap(vt));
+			break;
+		case VSWAP_SWAPFILE:
+		case VSWAP_FOLIO:
+		case VSWAP_ZERO:
+		case VSWAP_NONE:
+			break;
+		}
+
+		__vtable_set(ci_dyn, ci_off, vtable_mk_none());
+	}
+}
+
+void vswap_store_folio(swp_entry_t entry, struct folio *folio)
+{
+	struct swap_cluster_info *ci;
+	struct swap_cluster_info_dynamic *ci_dyn;
+	int i, nr = folio_nr_pages(folio);
+	unsigned int voff;
+
+	ci = __swap_entry_to_cluster(entry);
+	if (!ci)
+		return;
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	voff = swp_cluster_offset(entry);
+
+	spin_lock(&ci->lock);
+	vswap_release_backing(ci, voff, nr);
+	for (i = 0; i < nr; i++)
+		__vtable_set(ci_dyn, voff + i, vtable_mk_folio(folio));
+	spin_unlock(&ci->lock);
+}
+
+void vswap_prepare_writeout(swp_entry_t entry, struct folio *folio)
+{
+	struct swap_cluster_info *ci;
+	struct swap_cluster_info_dynamic *ci_dyn;
+	int i, nr = folio_nr_pages(folio);
+	unsigned int voff;
+	unsigned long vt;
+	enum vswap_backing_type type;
+
+	ci = __swap_entry_to_cluster(entry);
+	if (!ci)
+		return;
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	voff = swp_cluster_offset(entry);
+
+	spin_lock(&ci->lock);
+	vt = __vtable_get(ci_dyn, voff);
+	type = vtable_type(vt);
+
+	if (type == VSWAP_SWAPFILE || type == VSWAP_FOLIO || type == VSWAP_NONE) {
+		spin_unlock(&ci->lock);
+		return;
+	}
+
+	vswap_release_backing(ci, voff, nr);
+	for (i = 0; i < nr; i++)
+		__vtable_set(ci_dyn, voff + i, vtable_mk_folio(folio));
+	spin_unlock(&ci->lock);
+}
+
+#endif /* CONFIG_VSWAP */
+
 /**
  * folio_dup_swap() - Increase swap count of swap entries of a folio.
  * @folio: folio with swap entries bounded.
@@ -1989,6 +2193,9 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 
 	VM_WARN_ON(ci->count < nr_pages);
 
+	if (swap_is_vswap(si))
+		vswap_release_backing(ci, ci_start, nr_pages);
+
 	ci->count -= nr_pages;
 	do {
 		old_tb = __swap_table_get(ci, ci_off);
@@ -2240,12 +2447,15 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
 	if (pcp_si == si && pcp_offset) {
 		ci = swap_cluster_lock(si, pcp_offset);
 		if (ci && cluster_is_usable(ci, 0))
-			offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset);
+			offset = alloc_swap_scan_cluster(si, ci, NULL,
+					pcp_offset,
+					__swp_tb_mk_count(
+						shadow_to_swp_tb(NULL, 0), 1));
 		else if (ci)
 			swap_cluster_unlock(ci);
 	}
 	if (!offset)
-		offset = cluster_alloc_swap_entry(si, NULL);
+		offset = cluster_alloc_swap_entry_tb(si, NULL, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0), 1));
 	local_unlock(&percpu_swap_cluster.lock);
 	if (offset)
 		entry = swp_entry(si->type, offset);
@@ -2915,6 +3125,7 @@ static int try_to_unuse(unsigned int type)
 	       (i = find_next_to_unuse(si, i)) != 0) {
 
 		entry = swp_entry(type, i);
+
 		folio = swap_cache_get_folio(entry);
 		if (!folio)
 			continue;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca4533eba701..94b6cfcc28ac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -350,6 +350,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 		 */
 		if (get_nr_swap_pages() > 0)
 			return true;
+		/* vswap doesn't contribute to nr_swap_pages */
+		if (IS_ENABLED(CONFIG_VSWAP) && zswap_is_enabled())
+			return true;
 	} else {
 		/* Is the memcg below its swap limit? */
 		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
@@ -2615,7 +2618,7 @@ static bool can_age_anon_pages(struct lruvec *lruvec,
 			       struct scan_control *sc)
 {
 	/* Aging the anon LRU is valuable if swap is present: */
-	if (total_swap_pages > 0)
+	if (total_swap_pages > 0 || (IS_ENABLED(CONFIG_VSWAP) && zswap_is_enabled()))
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
diff --git a/mm/vswap.h b/mm/vswap.h
index 094ff16cb5a4..5e6e5b88593c 100644
--- a/mm/vswap.h
+++ b/mm/vswap.h
@@ -7,23 +7,307 @@
 #ifndef _MM_VSWAP_H
 #define _MM_VSWAP_H
 
+
 #include <linux/swap.h>
 
+struct zswap_entry;
+
+static inline bool swap_is_vswap(struct swap_info_struct *si)
+{
+	return si->flags & SWP_VSWAP;
+}
+
 #ifdef CONFIG_VSWAP
 
+#include "swap.h"
+#include "swap_table.h"
+
 extern struct swap_info_struct *vswap_si;
 
-static inline bool swap_is_vswap(struct swap_info_struct *si)
+/*
+ * Virtual table entry encoding for vswap clusters.
+ *
+ * Each entry in ci_dyn->virtual_table stores the backing type and
+ * pointer for a virtual swap slot. Tag in low 3 bits, payload in
+ * upper 61 bits.
+ *
+ *   NONE:   |----- 0000 ------|000|  — free / unbacked
+ *   PHYS:   |-- (type:5,off:N)|001|  — on a physical swapfile (shifted)
+ *   ZERO:   |----- 0000 ------|010|  — zero-filled page
+ *   ZSWAP:  |--- zswap_entry* |011|  — compressed in zswap (tag in low bits)
+ *   FOLIO:  |--- folio* ------|100|  — in-memory only (tag in low bits)
+ *
+ * PHYS (enum VSWAP_SWAPFILE) payloads are shifted left by 3. Pointer
+ * payloads (ZSWAP, FOLIO) are stored directly with the tag OR'd into the
+ * low bits (kernel pointers are >= 8-byte aligned, same approach as xarray).
+ */
+enum vswap_backing_type {
+	VSWAP_NONE	= 0,
+	VSWAP_SWAPFILE	= 1,
+	VSWAP_ZERO	= 2,
+	VSWAP_ZSWAP	= 3,
+	VSWAP_FOLIO	= 4,
+};
+
+#define VTABLE_TAG_BITS		3
+#define VTABLE_TAG_MASK		((1UL << VTABLE_TAG_BITS) - 1)
+
+static inline enum vswap_backing_type vtable_type(unsigned long vt)
 {
-	return si->flags & SWP_VSWAP;
+	return vt & VTABLE_TAG_MASK;
 }
 
-#else
+static inline unsigned long vtable_payload(unsigned long vt)
+{
+	return vt >> VTABLE_TAG_BITS;
+}
 
-static inline bool swap_is_vswap(struct swap_info_struct *si)
+static inline unsigned long vtable_mk(enum vswap_backing_type type,
+				       unsigned long payload)
+{
+	return (payload << VTABLE_TAG_BITS) | type;
+}
+
+static inline unsigned long vtable_mk_none(void)
+{
+	return 0;
+}
+
+static inline unsigned long vtable_mk_zero(void)
+{
+	return VSWAP_ZERO;
+}
+
+static inline unsigned long vtable_mk_zswap(struct zswap_entry *ze)
+{
+	return (unsigned long)ze | VSWAP_ZSWAP;
+}
+
+static inline struct zswap_entry *vtable_to_zswap(unsigned long vt)
+{
+	VM_WARN_ON(vtable_type(vt) != VSWAP_ZSWAP);
+	return (struct zswap_entry *)(vt & ~VTABLE_TAG_MASK);
+}
+
+static inline unsigned long vtable_mk_folio(struct folio *folio)
+{
+	return (unsigned long)folio | VSWAP_FOLIO;
+}
+
+static inline struct folio *vtable_to_folio(unsigned long vt)
+{
+	VM_WARN_ON(vtable_type(vt) != VSWAP_FOLIO);
+	return (struct folio *)(vt & ~VTABLE_TAG_MASK);
+}
+
+/* Virtual table accessors */
+
+static inline unsigned long __vtable_get(struct swap_cluster_info_dynamic *ci_dyn,
+					 unsigned int off)
+{
+	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+	return atomic_long_read(&ci_dyn->virtual_table[off]);
+}
+
+static inline void __vtable_set(struct swap_cluster_info_dynamic *ci_dyn,
+				unsigned int off, unsigned long vt)
+{
+	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+	atomic_long_set(&ci_dyn->virtual_table[off], vt);
+}
+
+/*
+ * Lock a vswap cluster and return the dynamic info + slot offset.
+ * Returns NULL if cluster not found.
+ * Caller must spin_unlock(&ci_dyn->ci.lock) when done.
+ */
+static inline struct swap_cluster_info_dynamic *
+vswap_lock_cluster(swp_entry_t entry, unsigned int *voff)
+{
+	struct swap_cluster_info *ci;
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci = __swap_entry_to_cluster(entry);
+	if (!ci)
+		return NULL;
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	*voff = swp_cluster_offset(entry);
+	spin_lock(&ci->lock);
+	return ci_dyn;
+}
+
+/* Zswap entry helpers — store/load/erase in virtual_table */
+
+void vswap_release_backing(struct swap_cluster_info *ci,
+			   unsigned int ci_start, unsigned int nr);
+
+static inline void vswap_zswap_store(swp_entry_t entry,
+				     struct zswap_entry *ze)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	unsigned int voff;
+
+	ci_dyn = vswap_lock_cluster(entry, &voff);
+	if (!ci_dyn)
+		return;
+	vswap_release_backing(&ci_dyn->ci, voff, 1);
+	__vtable_set(ci_dyn, voff, vtable_mk_zswap(ze));
+	spin_unlock(&ci_dyn->ci.lock);
+}
+
+static inline struct zswap_entry *vswap_zswap_load(swp_entry_t entry)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	unsigned int voff;
+	unsigned long vt;
+
+	ci_dyn = vswap_lock_cluster(entry, &voff);
+	if (!ci_dyn)
+		return NULL;
+	vt = __vtable_get(ci_dyn, voff);
+	spin_unlock(&ci_dyn->ci.lock);
+
+	if (vtable_type(vt) != VSWAP_ZSWAP)
+		return NULL;
+	return vtable_to_zswap(vt);
+}
+
+
+void vswap_store_folio(swp_entry_t entry, struct folio *folio);
+void vswap_prepare_writeout(swp_entry_t entry, struct folio *folio);
+
+/*
+ * Count leading vtable entries from @entry that share the first slot's
+ * backing type (== @nr when uniform). Returns 0, *@typep untouched, if the
+ * cluster is missing; else *@typep = that type (VSWAP_NONE when @nr == 0).
+ */
+static inline int vswap_check_backing(swp_entry_t entry, int nr,
+				      enum vswap_backing_type *typep)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	enum vswap_backing_type first_type = VSWAP_NONE;
+	unsigned int voff;
+	unsigned long vt;
+	int i;
+
+	ci_dyn = vswap_lock_cluster(entry, &voff);
+	if (!ci_dyn)
+		return 0;
+
+	for (i = 0; i < nr; i++) {
+		vt = __vtable_get(ci_dyn, voff + i);
+		if (!i)
+			first_type = vtable_type(vt);
+		else if (vtable_type(vt) != first_type)
+			break;
+	}
+	spin_unlock(&ci_dyn->ci.lock);
+
+	if (typep)
+		*typep = first_type;
+	return i;
+}
+
+static inline bool vswap_can_swapin_thp(swp_entry_t entry, int nr)
+{
+	enum vswap_backing_type type;
+
+	return vswap_check_backing(entry, nr, &type) == nr &&
+	       type == VSWAP_ZERO;
+}
+
+static inline int vswap_cluster_alloc_vtable(struct swap_cluster_info_dynamic *ci_dyn)
+{
+	ci_dyn->virtual_table = kcalloc(SWAPFILE_CLUSTER,
+					sizeof(*ci_dyn->virtual_table),
+					GFP_ATOMIC);
+	return ci_dyn->virtual_table ? 0 : -ENOMEM;
+}
+
+static inline void vswap_cluster_free_vtable(struct swap_cluster_info *ci)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	kfree(ci_dyn->virtual_table);
+	ci_dyn->virtual_table = NULL;
+}
+
+/* Low-level setter for callers already holding the cluster lock */
+static inline void vswap_set_zswap(struct swap_cluster_info *ci,
+				   unsigned int ci_off,
+				   struct zswap_entry *ze)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	__vtable_set(ci_dyn, ci_off, vtable_mk_zswap(ze));
+}
+
+/* Zeromap helpers — test/set ZERO backing in virtual_table */
+
+static inline bool vswap_test_zero(struct swap_cluster_info *ci,
+				   unsigned int ci_off)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	return vtable_type(__vtable_get(ci_dyn, ci_off)) == VSWAP_ZERO;
+}
+
+static inline void vswap_set_zero(struct swap_cluster_info *ci,
+				  unsigned int ci_off)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	__vtable_set(ci_dyn, ci_off, vtable_mk_zero());
+}
+
+#else /* !CONFIG_VSWAP */
+
+static inline void vswap_release_backing(struct swap_cluster_info *ci,
+					 unsigned int ci_start,
+					 unsigned int nr) {}
+
+static inline void vswap_zswap_store(swp_entry_t entry,
+				     struct zswap_entry *ze) {}
+
+static inline struct zswap_entry *vswap_zswap_load(swp_entry_t entry)
+{
+	return NULL;
+}
+
+static inline void vswap_store_folio(swp_entry_t entry,
+				     struct folio *folio) {}
+static inline void vswap_prepare_writeout(swp_entry_t entry,
+					  struct folio *folio) {}
+
+static inline bool vswap_can_swapin_thp(swp_entry_t entry, int nr)
+{
+	return false;
+}
+
+struct swap_cluster_info_dynamic;
+static inline int vswap_cluster_alloc_vtable(struct swap_cluster_info_dynamic *ci_dyn)
+{
+	return 0;
+}
+
+static inline void vswap_cluster_free_vtable(struct swap_cluster_info *ci) {}
+
+static inline void vswap_set_zswap(struct swap_cluster_info *ci,
+				   unsigned int ci_off,
+				   struct zswap_entry *ze) {}
+
+static inline bool vswap_test_zero(struct swap_cluster_info *ci,
+				   unsigned int ci_off)
 {
 	return false;
 }
 
+static inline void vswap_set_zero(struct swap_cluster_info *ci,
+				  unsigned int ci_off) {}
+
 #endif /* CONFIG_VSWAP */
 #endif /* _MM_VSWAP_H */
diff --git a/mm/zswap.c b/mm/zswap.c
index 993406074d58..c57bf0246bb2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -38,6 +38,7 @@
 #include <linux/zsmalloc.h>
 
 #include "swap.h"
+#include "vswap.h"
 #include "internal.h"
 
 /*********************************
@@ -762,7 +763,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
  * Carries out the common pattern of freeing an entry's zsmalloc allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
  */
-static void zswap_entry_free(struct zswap_entry *entry)
+void zswap_entry_free(struct zswap_entry *entry)
 {
 	zswap_lru_del(&zswap_list_lru, entry);
 	zs_free(entry->pool->zs_pool, entry->handle);
@@ -994,16 +995,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct swap_info_struct *si;
 	int ret = 0;
 
+	/* try to allocate swap cache folio */
 	si = get_swap_device(swpentry);
 	if (!si)
 		return -EEXIST;
 
+	/*
+	 * Vswap entries have no physical backing — writeback would fail
+	 * and SIGBUS the caller. Bail before we waste a swap-cache folio
+	 * allocation.
+	 */
 	if (si->flags & SWP_VSWAP) {
 		put_swap_device(si);
 		return -EINVAL;
 	}
 
-	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
 	folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
 				       NO_INTERLEAVE_INDEX);
@@ -1206,6 +1212,18 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
 		return 0;
 
+	/*
+	 * With CONFIG_VSWAP and zswap enabled, every zswap entry is
+	 * vswap-backed and needs a physical swap slot allocated on demand
+	 * (via folio_realloc_swap) for writeback. If no physical slots are
+	 * available, writeback will fail — skip the shrinker to avoid
+	 * spinning on entries we cannot drain. Vanilla zswap-on-swapfile is
+	 * unaffected because every zswap entry already has a backing slot;
+	 * gate on CONFIG_VSWAP so the check compiles out there.
+	 */
+	if (IS_ENABLED(CONFIG_VSWAP) && !get_nr_swap_pages())
+		return 0;
+
 	/*
 	 * The shrinker resumes swap writeback, which will enter block
 	 * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
@@ -1416,25 +1434,25 @@ static bool zswap_store_page(struct page *page,
 	if (!zswap_compress(page, entry, pool))
 		goto compress_failed;
 
-	old = xa_store(swap_zswap_tree(page_swpentry),
-		       swp_offset(page_swpentry),
-		       entry, GFP_KERNEL);
-	if (xa_is_err(old)) {
-		int err = xa_err(old);
+	if (swap_is_vswap(__swap_entry_to_info(page_swpentry))) {
+		vswap_zswap_store(page_swpentry, entry);
+	} else {
+		old = xa_store(swap_zswap_tree(page_swpentry),
+			       swp_offset(page_swpentry),
+			       entry, GFP_KERNEL);
+		if (xa_is_err(old)) {
+			int err = xa_err(old);
+
+			WARN_ONCE(err != -ENOMEM,
+				  "unexpected xarray error: %d\n", err);
+			zswap_reject_alloc_fail++;
+			goto store_failed;
+		}
 
-		WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-		zswap_reject_alloc_fail++;
-		goto store_failed;
+		if (old)
+			zswap_entry_free(old);
 	}
 
-	/*
-	 * We may have had an existing entry that became stale when
-	 * the folio was redirtied and now the new version is being
-	 * swapped out. Get rid of the old.
-	 */
-	if (old)
-		zswap_entry_free(old);
-
 	/*
 	 * The entry is successfully compressed and stored in the tree, there is
 	 * no further possibility of failure. Grab refs to the pool and objcg,
@@ -1533,6 +1551,8 @@ bool zswap_store(struct folio *folio)
 
 	count_vm_events(ZSWPOUT, nr_pages);
 
+	/* zswap_store_page stores directly in virtual_table for vswap */
+
 	ret = true;
 
 put_pool:
@@ -1547,8 +1567,14 @@ bool zswap_store(struct folio *folio)
 	 * the possibly stale entries which were previously stored at the
 	 * offsets corresponding to each page of the folio. Otherwise,
 	 * writeback could overwrite the new data in the swapfile.
+	 *
+	 * vswap stores zswap entries directly in the per-slot virtual_table
+	 * (no per-device xarray), so the stale-entry cleanup is implicit:
+	 * a successful vswap_zswap_store overwrites the slot via
+	 * vswap_release_backing, and a failed store leaves the old backing
+	 * untouched.
 	 */
-	if (!ret) {
+	if (!ret && !swap_is_vswap(__swap_entry_to_info(swp))) {
 		unsigned type = swp_type(swp);
 		pgoff_t offset = swp_offset(swp);
 		struct zswap_entry *entry;
@@ -1588,8 +1614,7 @@ bool zswap_store(struct folio *folio)
 int zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-	pgoff_t offset = swp_offset(swp);
-	struct xarray *tree = swap_zswap_tree(swp);
+	struct swap_info_struct *si = __swap_entry_to_info(swp);
 	struct zswap_entry *entry;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
@@ -1599,16 +1624,25 @@ int zswap_load(struct folio *folio)
 		return -ENOENT;
 
 	/*
-	 * Large folios should not be swapped in while zswap is being used, as
-	 * they are not properly handled. Zswap does not properly load large
-	 * folios, and a large folio may only be partially in zswap.
+	 * zswap_load() does not support large folios. For non-vswap
+	 * entries this is unexpected on the swapin path: WARN and
+	 * sigbus. For vswap entries vswap_can_swapin_thp() has already
+	 * filtered out ZSWAP-backed THPs, so the large folio here is
+	 * zero- or phys-backed; return -ENOENT to fall through to the
+	 * phys/zero IO path.
 	 */
-	if (WARN_ON_ONCE(folio_test_large(folio))) {
-		folio_unlock(folio);
-		return -EINVAL;
+	if (folio_test_large(folio)) {
+		if (WARN_ON_ONCE(!swap_is_vswap(si))) {
+			folio_unlock(folio);
+			return -EINVAL;
+		}
+		return -ENOENT;
 	}
 
-	entry = xa_load(tree, offset);
+	if (swap_is_vswap(si))
+		entry = vswap_zswap_load(swp);
+	else
+		entry = xa_load(swap_zswap_tree(swp), swp_offset(swp));
 	if (!entry)
 		return -ENOENT;
 
@@ -1623,16 +1657,14 @@ int zswap_load(struct folio *folio)
 	if (entry->objcg)
 		count_objcg_events(entry->objcg, ZSWPIN, 1);
 
-	/*
-	 * We are reading into the swapcache, invalidate zswap entry.
-	 * The swapcache is the authoritative owner of the page and
-	 * its mappings, and the pressure that results from having two
-	 * in-memory copies outweighs any benefits of caching the
-	 * compression work.
-	 */
 	folio_mark_dirty(folio);
-	xa_erase(tree, offset);
-	zswap_entry_free(entry);
+
+	if (swap_is_vswap(si)) {
+		vswap_store_folio(swp, folio);
+	} else {
+		xa_erase(swap_zswap_tree(swp), swp_offset(swp));
+		zswap_entry_free(entry);
+	}
 
 	folio_unlock(folio);
 	return 0;
-- 
2.53.0-Meta


^ permalink raw reply related

* [RFC PATCH 3/5] mm, swap: support physical swap as a vswap backend
From: Nhat Pham @ 2026-05-28 21:29 UTC (permalink / raw)
  To: kasong
  Cc: Liam.Howlett, akpm, apopple, axelrasmussen, baohua, baolin.wang,
	bhe, byungchul, cgroups, chengming.zhou, chrisl, corbet, david,
	dev.jain, gourry, hannes, hughd, jannh, joshua.hahnjy, lance.yang,
	lenb, linux-doc, linux-kernel, linux-mm, linux-pm,
	lorenzo.stoakes, matthew.brost, mhocko, muchun.song, npache,
	nphamcs, pavel, peterx, peterz, pfalcato, rafael, rakie.kim,
	roman.gushchin, rppt, ryan.roberts, shakeel.butt, shikemeng,
	surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed, yuanchu,
	zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>

Add physical swap as a backend for the virtual swap layer.
Without this, vswap can only back entries with zswap or zero pages,
and a zswap_store failure has nowhere to fall back to — the page
stays dirty in swap cache (AOP_WRITEPAGE_ACTIVATE).

With physical swap backing, vswap can allocate a physical slot on
demand when needed: as a fallback for zswap_store failures, or as
the destination for zswap writeback.

Each vswap entry's physical slot is tracked via a Pointer-tagged
swap_table entry on the physical cluster (rmap back to the vswap
entry).

Suggested-by: Kairui Song <kasong@tencent.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h |  10 ++
 mm/memcontrol.c      |   8 +-
 mm/memory.c          |  14 +-
 mm/page_io.c         | 130 ++++++++++----
 mm/swap.h            |  11 ++
 mm/swap_table.h      |   1 +
 mm/swapfile.c        | 398 ++++++++++++++++++++++++++++++++++++++++---
 mm/vswap.h           | 138 ++++++++++++++-
 mm/zswap.c           |  79 ++++++---
 9 files changed, 698 insertions(+), 91 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ee9b1e76b058..3fb55485fc76 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -449,6 +449,16 @@ extern int swp_swapcount(swp_entry_t entry);
 struct backing_dev_info;
 extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
 sector_t swap_folio_sector(struct folio *folio);
+sector_t swap_entry_sector(swp_entry_t entry);
+
+#ifdef CONFIG_VSWAP
+swp_entry_t folio_realloc_swap(struct folio *folio);
+#else
+static inline swp_entry_t folio_realloc_swap(struct folio *folio)
+{
+	return (swp_entry_t){};
+}
+#endif
 
 /*
  * If there is an existing swap slot reference (swap entry) and the caller
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3ad83c229f7..7492879b3239 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5541,7 +5541,13 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
 {
 	long nr_swap_pages;
 
-	/* vswap provides unbounded virtual swap when zswap is enabled */
+	/*
+	 * vswap provides unbounded virtual swap when zswap is enabled.
+	 * (No per-memcg may_zswap check — mem_cgroup_may_zswap can sleep
+	 * via __mem_cgroup_flush_stats, but this is callable from
+	 * rcu_read_lock contexts like cachestat(2) → workingset_test_recent.
+	 * The per-memcg swap.max is still enforced at charge time.)
+	 */
 	if (IS_ENABLED(CONFIG_VSWAP) && zswap_is_enabled())
 		return PAGE_COUNTER_MAX;
 
diff --git a/mm/memory.c b/mm/memory.c
index c3050e49b086..d15c748d4f90 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -89,6 +89,7 @@
 #include "pgalloc-track.h"
 #include "internal.h"
 #include "swap.h"
+#include "vswap.h"
 
 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
@@ -4523,7 +4524,14 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
 	 * are fast, and meanwhile, swap cache pinning the slot deferring the
 	 * release of metadata or fragmentation is a more critical issue.
 	 */
-	if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+	if (swap_entry_backend_has_flag(si, folio->swap, SWP_SYNCHRONOUS_IO))
+		return true;
+	/*
+	 * Non-swapfile backends cannot be reused for future swapouts.
+	 * Free the swap slot unless backed by contiguous physical swap.
+	 */
+	if (swap_is_vswap(si) &&
+	    !vswap_swapfile_backed(folio->swap, folio_nr_pages(folio)))
 		return true;
 	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
 	    folio_test_mlocked(folio))
@@ -4832,7 +4840,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		swap_update_readahead(folio, vma, vmf->address);
 	if (!folio) {
 		/* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+		if (swap_entry_backend_has_flag(si, entry, SWP_SYNCHRONOUS_IO))
 			folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
 					    thp_swapin_suitable_orders(vmf) | BIT(0),
 					    vmf, NULL, 0);
@@ -5007,7 +5015,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			 */
 			exclusive = true;
 		} else if (exclusive && folio_test_writeback(folio) &&
-			  data_race(si->flags & SWP_STABLE_WRITES)) {
+			  swap_entry_backend_has_flag(si, entry, SWP_STABLE_WRITES)) {
 			/*
 			 * This is tricky: not all swap backends support
 			 * concurrent page modifications while under writeback.
diff --git a/mm/page_io.c b/mm/page_io.c
index b3c7e56c8eed..a65734564819 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -260,6 +260,7 @@ static void swap_zeromap_folio_clear(struct folio *folio)
  */
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 {
+	swp_entry_t phys;
 	int ret = 0;
 
 	if (folio_free_swap(folio))
@@ -292,6 +293,12 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	 */
 	swap_zeromap_folio_clear(folio);
 
+	/*
+	 * For vswap: release stale non-swapfile backends before writeout.
+	 * If already PHYS-backed (contiguous), keep it. Otherwise free old
+	 * backing (e.g. ZSWAP from a previous swapout cycle) and set FOLIO
+	 * so zswap_store or folio_realloc_swap starts clean.
+	 */
 	if (swap_is_vswap(__swap_entry_to_info(folio->swap)))
 		vswap_prepare_writeout(folio->swap, folio);
 
@@ -309,8 +316,19 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 	rcu_read_unlock();
 
 	if (swap_is_vswap(__swap_entry_to_info(folio->swap))) {
-		folio_mark_dirty(folio);
-		return AOP_WRITEPAGE_ACTIVATE;
+		/*
+		 * zswap_store may have partially populated the vtable with
+		 * ZSWAP entries before failing. Reset to FOLIO (freeing
+		 * those partial entries) so folio_realloc_swap can install
+		 * PHYS cleanly without leaking zswap_entry pointers.
+		 */
+		vswap_prepare_writeout(folio->swap, folio);
+		phys = folio_realloc_swap(folio);
+		if (!phys.val) {
+			folio_mark_dirty(folio);
+			return AOP_WRITEPAGE_ACTIVATE;
+		}
+		return __swap_writepage_phys(folio, swap_plug, phys);
 	}
 
 	return __swap_writepage(folio, swap_plug);
@@ -402,12 +420,12 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
 	mempool_free(sio, sio_pool);
 }
 
-static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
+static void swap_writepage_fs(struct folio *folio,
+			      struct swap_info_struct *sis, loff_t pos,
+			      struct swap_iocb **swap_plug)
 {
 	struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
-	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 	struct file *swap_file = sis->swap_file;
-	loff_t pos = swap_dev_pos(folio->swap);
 
 	count_swpout_vm_event(folio);
 	folio_start_writeback(folio);
@@ -439,13 +457,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
 }
 
 static void swap_writepage_bdev_sync(struct folio *folio,
-		struct swap_info_struct *sis)
+		struct swap_info_struct *sis, sector_t sector)
 {
 	struct bio_vec bv;
 	struct bio bio;
 
 	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
-	bio.bi_iter.bi_sector = swap_folio_sector(folio);
+	bio.bi_iter.bi_sector = sector;
 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
 
 	bio_associate_blkg_from_page(&bio, folio);
@@ -475,6 +493,42 @@ static void swap_writepage_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
+#ifdef CONFIG_VSWAP
+int __swap_writepage_phys(struct folio *folio, struct swap_iocb **swap_plug,
+			  swp_entry_t phys_entry)
+{
+	struct swap_info_struct *sis = __swap_entry_to_info(phys_entry);
+	sector_t sector = swap_entry_sector(phys_entry);
+	struct bio *bio;
+
+	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+	VM_WARN_ON(swap_is_vswap(sis));
+
+	if (data_race(sis->flags & SWP_FS_OPS)) {
+		swap_writepage_fs(folio, sis, swap_dev_pos(phys_entry),
+				  swap_plug);
+		return 0;
+	}
+
+	if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
+		swap_writepage_bdev_sync(folio, sis, sector);
+		return 0;
+	}
+
+	bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_end_io = end_swap_bio_write;
+	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+
+	bio_associate_blkg_from_page(bio, folio);
+	count_swpout_vm_event(folio);
+	folio_start_writeback(folio);
+	folio_unlock(folio);
+	submit_bio(bio);
+	return 0;
+}
+#endif
+
 int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
@@ -493,14 +547,10 @@ int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
 	 * is safe.
 	 */
 	if (data_race(sis->flags & SWP_FS_OPS))
-		swap_writepage_fs(folio, swap_plug);
-	/*
-	 * ->flags can be updated non-atomically,
-	 * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
-	 * is safe.
-	 */
+		swap_writepage_fs(folio, sis, swap_dev_pos(folio->swap),
+				  swap_plug);
 	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
-		swap_writepage_bdev_sync(folio, sis);
+		swap_writepage_bdev_sync(folio, sis, swap_folio_sector(folio));
 	else
 		swap_writepage_bdev_async(folio, sis);
 	return 0;
@@ -624,11 +674,11 @@ static bool swap_read_folio_zeromap(struct folio *folio)
 	return true;
 }
 
-static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
+static void swap_read_folio_fs(struct folio *folio,
+			       struct swap_info_struct *sis, loff_t pos,
+			       struct swap_iocb **plug)
 {
-	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 	struct swap_iocb *sio = NULL;
-	loff_t pos = swap_dev_pos(folio->swap);
 
 	if (plug)
 		sio = *plug;
@@ -659,13 +709,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
 }
 
 static void swap_read_folio_bdev_sync(struct folio *folio,
-		struct swap_info_struct *sis)
+		struct swap_info_struct *sis, sector_t sector)
 {
 	struct bio_vec bv;
 	struct bio bio;
 
 	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
-	bio.bi_iter.bi_sector = swap_folio_sector(folio);
+	bio.bi_iter.bi_sector = sector;
 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
 	/*
 	 * Keep this task valid during swap readpage because the oom killer may
@@ -681,12 +731,12 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
 }
 
 static void swap_read_folio_bdev_async(struct folio *folio,
-		struct swap_info_struct *sis)
+		struct swap_info_struct *sis, sector_t sector)
 {
 	struct bio *bio;
 
 	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
-	bio->bi_iter.bi_sector = swap_folio_sector(folio);
+	bio->bi_iter.bi_sector = sector;
 	bio->bi_end_io = end_swap_bio_read;
 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
 	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
@@ -695,6 +745,22 @@ static void swap_read_folio_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
+static void swap_read_folio_phys(struct folio *folio, swp_entry_t phys_entry,
+				struct swap_iocb **plug)
+{
+	struct swap_info_struct *sis = __swap_entry_to_info(phys_entry);
+	sector_t sector = swap_entry_sector(phys_entry);
+
+	zswap_folio_swapin(folio);
+
+	if (data_race(sis->flags & SWP_FS_OPS))
+		swap_read_folio_fs(folio, sis, swap_dev_pos(phys_entry), plug);
+	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
+		swap_read_folio_bdev_sync(folio, sis, sector);
+	else
+		swap_read_folio_bdev_async(folio, sis, sector);
+}
+
 void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
@@ -702,6 +768,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	bool workingset = folio_test_workingset(folio);
 	unsigned long pflags;
 	bool in_thrashing;
+	swp_entry_t phys;
 
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -726,20 +793,15 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	if (zswap_load(folio) != -ENOENT)
 		goto finish;
 
-	if (unlikely(sis->flags & SWP_VSWAP)) {
-		folio_unlock(folio);
-		goto finish;
-	}
-
-	/* We have to read from slower devices. Increase zswap protection. */
-	zswap_folio_swapin(folio);
-
-	if (data_race(sis->flags & SWP_FS_OPS)) {
-		swap_read_folio_fs(folio, plug);
-	} else if (synchronous) {
-		swap_read_folio_bdev_sync(folio, sis);
+	if (swap_is_vswap(sis)) {
+		phys = vswap_to_phys(folio->swap);
+		if (!phys.val) {
+			folio_unlock(folio);
+			goto finish;
+		}
+		swap_read_folio_phys(folio, phys, plug);
 	} else {
-		swap_read_folio_bdev_async(folio, sis);
+		swap_read_folio_phys(folio, folio->swap, plug);
 	}
 
 finish:
diff --git a/mm/swap.h b/mm/swap.h
index 640413e30880..50c90a35382c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -285,6 +285,17 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 void swap_write_unplug(struct swap_iocb *sio);
 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
 int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+#ifdef CONFIG_VSWAP
+int __swap_writepage_phys(struct folio *folio, struct swap_iocb **swap_plug,
+			  swp_entry_t phys_entry);
+#else
+static inline int __swap_writepage_phys(struct folio *folio,
+					struct swap_iocb **swap_plug,
+					swp_entry_t phys_entry)
+{
+	return -EINVAL;
+}
+#endif
 
 /* linux/mm/swap_state.c */
 extern struct address_space swap_space __read_mostly;
diff --git a/mm/swap_table.h b/mm/swap_table.h
index b0e7ef9c966b..814bc75597a0 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -406,6 +406,7 @@ static inline swp_entry_t swp_tb_ptr_to_swp_entry(unsigned long swp_tb)
 	return entry;
 }
 #else
+#define SWP_RMAP_CACHE_ONLY	0UL
 static inline bool swp_tb_is_pointer(unsigned long swp_tb)
 {
 	return false;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c90d83fd628a..a0976be6a12b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -145,10 +145,16 @@ static DEFINE_PER_CPU(struct percpu_vswap_cluster, percpu_vswap_cluster) = {
 static bool vswap_alloc(struct folio *folio);
 static void vswap_free_cluster(struct swap_info_struct *si,
 			       struct swap_cluster_info *ci);
+static void vswap_mark_cache_only(struct swap_info_struct *si,
+				  struct swap_cluster_info *ci,
+				  unsigned int ci_off);
 #else
 static inline bool vswap_alloc(struct folio *folio) { return false; }
 static inline void vswap_free_cluster(struct swap_info_struct *si,
 				      struct swap_cluster_info *ci) {}
+static inline void vswap_mark_cache_only(struct swap_info_struct *si,
+					 struct swap_cluster_info *ci,
+					 unsigned int ci_off) {}
 #endif
 
 /* May return NULL on invalid type, caller must check for NULL return */
@@ -350,19 +356,24 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
 	BUG();
 }
 
-sector_t swap_folio_sector(struct folio *folio)
+sector_t swap_entry_sector(swp_entry_t entry)
 {
-	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
+	struct swap_info_struct *sis = __swap_entry_to_info(entry);
 	struct swap_extent *se;
 	sector_t sector;
 	pgoff_t offset;
 
-	offset = swp_offset(folio->swap);
+	offset = swp_offset(entry);
 	se = offset_to_swap_extent(sis, offset);
 	sector = se->start_block + (offset - se->start_page);
 	return sector << (PAGE_SHIFT - 9);
 }
 
+sector_t swap_folio_sector(struct folio *folio)
+{
+	return swap_entry_sector(folio->swap);
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -880,6 +891,60 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si,
 	return ret;
 }
 
+/*
+ * Try to reclaim a Pointer-tagged physical slot backing a vswap entry.
+ * The physical cluster lock must NOT be held. Returns < 0 on failure.
+ */
+static int try_to_reclaim_vswap_backing(struct swap_info_struct *si,
+					unsigned long offset)
+{
+	struct swap_cluster_info *ci;
+	swp_entry_t vswap_entry, phys_entry;
+	struct folio *folio;
+	unsigned long swp_tb;
+	unsigned int ci_off;
+
+	ci = swap_cluster_lock(si, offset);
+	if (!ci)
+		return -1;
+	ci_off = offset % SWAPFILE_CLUSTER;
+	swp_tb = __swap_table_get(ci, ci_off);
+	if (!swp_tb_is_pointer(swp_tb) || !(swp_tb & SWP_RMAP_CACHE_ONLY)) {
+		swap_cluster_unlock(ci);
+		return -1;
+	}
+	vswap_entry = swp_tb_ptr_to_swp_entry(swp_tb);
+	swap_cluster_unlock(ci);
+
+	folio = swap_cache_get_folio(vswap_entry);
+	if (!folio)
+		return -1;
+
+	if (!folio_trylock(folio)) {
+		folio_put(folio);
+		return -1;
+	}
+
+	if (!folio_matches_swap_entry(folio, vswap_entry)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -1;
+	}
+
+	phys_entry = vswap_to_phys(vswap_entry);
+	if (!phys_entry.val || swp_offset(phys_entry) != offset ||
+	    swp_type(phys_entry) != si->type) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -1;
+	}
+
+	vswap_store_folio(vswap_entry, folio);
+	folio_unlock(folio);
+	folio_put(folio);
+	return 0;
+}
+
 /*
  * Reclaim drops the ci lock, so the cluster may become unusable (freed or
  * stolen by a lower order). @usable will be set to false if that happens.
@@ -903,8 +968,13 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 	spin_unlock(&ci->lock);
 	do {
 		swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
-		if (swp_tb_is_pointer(swp_tb))
-			break;
+		if (swp_tb_is_pointer(swp_tb)) {
+			rcu_read_unlock();
+			if (try_to_reclaim_vswap_backing(si, offset) < 0)
+				goto relock;
+			rcu_read_lock();
+			continue;
+		}
 		if (swp_tb_get_count(swp_tb))
 			break;
 		if (swp_tb_is_folio(swp_tb))
@@ -912,6 +982,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 				break;
 	} while (++offset < end);
 	rcu_read_unlock();
+relock:
 
 	/* Re-lookup: dynamic cluster may have been freed while lock was dropped */
 	ci = swap_cluster_lock(si, start);
@@ -983,6 +1054,8 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
 					 unsigned int order)
 {
 	unsigned long nr_pages = 1 << order;
+	swp_entry_t vswap_entry, v;
+	unsigned int i;
 
 	lockdep_assert_held(&ci->lock);
 
@@ -991,11 +1064,24 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
 
 	swap_cluster_assert_empty(ci, ci_off, nr_pages, false);
 
-	if (swp_tb_is_folio(swp_tb))
+	if (swp_tb_is_folio(swp_tb)) {
 		__swap_cache_add_folio(ci, folio, swp_entry(si->type,
 							    ci_off + cluster_offset(si, ci)));
-	else
+	} else if (swp_tb_is_pointer(swp_tb) && nr_pages > 1) {
+		/*
+		 * Pointer-tagged rmap for vswap-backing THP — each
+		 * physical slot points back to its own vswap entry.
+		 */
+		vswap_entry = folio->swap;
+		for (i = 0; i < nr_pages; i++) {
+			v = vswap_entry;
+			v.val += i;
+			__swap_table_set(ci, ci_off + i,
+					 swp_entry_to_swp_tb_ptr(v));
+		}
+	} else {
 		__swap_table_set(ci, ci_off, swp_tb);
+	}
 
 	/*
 	 * The first allocation in a cluster makes the
@@ -1167,6 +1253,13 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 					offset += abs(nr_reclaim);
 					continue;
 				}
+			} else if (swp_tb_is_pointer(swp_tb) &&
+				   swap_rmap_is_cache_only(ci, offset % SWAPFILE_CLUSTER)) {
+				spin_unlock(&ci->lock);
+				try_to_reclaim_vswap_backing(si, offset);
+				ci = swap_cluster_lock(si, offset);
+				if (!ci)
+					goto next;
 			}
 			offset++;
 		}
@@ -1507,7 +1600,14 @@ static swp_entry_t swap_alloc_fast(struct folio *folio)
 	if (!si || !offset || !get_swap_device_info(si))
 		return (swp_entry_t){};
 
-	swp_tb = folio_to_swp_tb(folio, 0);
+	/*
+	 * Folio already in swap cache: allocating physical backing for a
+	 * vswap entry (folio_realloc_swap).
+	 */
+	if (folio_test_swapcache(folio))
+		swp_tb = swp_entry_to_swp_tb_ptr(folio->swap);
+	else
+		swp_tb = folio_to_swp_tb(folio, 0);
 
 	ci = swap_cluster_lock(si, offset);
 	if (ci && cluster_is_usable(ci, order)) {
@@ -1530,7 +1630,11 @@ static swp_entry_t swap_alloc_slow(struct folio *folio)
 	struct swap_info_struct *si, *next;
 	unsigned long swp_tb, found;
 
-	swp_tb = folio_to_swp_tb(folio, 0);
+	/* See comment in swap_alloc_fast() */
+	if (folio_test_swapcache(folio))
+		swp_tb = swp_entry_to_swp_tb_ptr(folio->swap);
+	else
+		swp_tb = folio_to_swp_tb(folio, 0);
 
 	spin_lock(&swap_avail_lock);
 start_over:
@@ -1722,6 +1826,8 @@ static void swap_put_entries_cluster(struct swap_info_struct *si,
 			}
 			/* count will be 0 after put, slot can be reclaimed */
 			need_reclaim = true;
+			if (swap_is_vswap(si))
+				vswap_mark_cache_only(si, ci, ci_off);
 		}
 		/*
 		 * A count != 1 or cached slot can't be freed. Put its swap
@@ -1922,12 +2028,7 @@ int folio_alloc_swap(struct folio *folio)
 		}
 	}
 
-	/*
-	 * Skip vswap when zswap is disabled — without zswap, vswap entries
-	 * have nowhere to go on writeout (no physical fallback yet; that
-	 * arrives in the next patch).
-	 */
-	if (zswap_is_enabled() && vswap_alloc(folio))
+	if (vswap_alloc(folio))
 		goto done;
 
 again:
@@ -1953,6 +2054,25 @@ int folio_alloc_swap(struct folio *folio)
 }
 
 #ifdef CONFIG_VSWAP
+static void vswap_mark_cache_only(struct swap_info_struct *si,
+				  struct swap_cluster_info *ci,
+				  unsigned int ci_off)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	struct swap_cluster_info *pci;
+	swp_entry_t phys;
+	unsigned long vt;
+
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	vt = __vtable_get(ci_dyn, ci_off);
+
+	if (vtable_type(vt) == VSWAP_SWAPFILE) {
+		phys = vtable_to_phys(vt);
+		pci = __swap_entry_to_cluster(phys);
+		swap_rmap_mark_cache_only(pci, swp_cluster_offset(phys));
+	}
+}
+
 static void vswap_free_cluster(struct swap_info_struct *si,
 			       struct swap_cluster_info *ci)
 {
@@ -1971,12 +2091,21 @@ static void vswap_free_cluster(struct swap_info_struct *si,
 	kfree_rcu(ci_dyn, rcu);
 }
 
+static void __swap_cluster_free_phys_backing(struct swap_info_struct *psi,
+					     struct swap_cluster_info *pci,
+					     unsigned int ci_start,
+					     unsigned int nr_pages);
+
 void vswap_release_backing(struct swap_cluster_info *ci,
 			   unsigned int ci_start, unsigned int nr)
 {
 	struct swap_cluster_info_dynamic *ci_dyn;
+	struct swap_info_struct *psi;
+	unsigned long phys_start = 0, phys_end = 0;
+	unsigned int phys_type = 0;
 	unsigned int ci_off;
 	unsigned long vt;
+	swp_entry_t phys;
 
 	lockdep_assert_held(&ci->lock);
 	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
@@ -1984,12 +2113,41 @@ void vswap_release_backing(struct swap_cluster_info *ci,
 	for (ci_off = ci_start; ci_off < ci_start + nr; ci_off++) {
 		vt = __vtable_get(ci_dyn, ci_off);
 
+		/*
+		 * Flush batched physical slots when the next entry
+		 * breaks contiguity, changes type/device, or would
+		 * cross a SWAPFILE_CLUSTER boundary (the free helper
+		 * operates on a single cluster).
+		 */
+		if (phys_start != phys_end &&
+		    (vtable_type(vt) != VSWAP_SWAPFILE ||
+		     swp_type(vtable_to_phys(vt)) != phys_type ||
+		     swp_offset(vtable_to_phys(vt)) != phys_end ||
+		     phys_end % SWAPFILE_CLUSTER == 0)) {
+			psi = __swap_type_to_info(phys_type);
+			__swap_cluster_free_phys_backing(psi,
+				__swap_entry_to_cluster(
+					swp_entry(phys_type, phys_start)),
+				phys_start % SWAPFILE_CLUSTER,
+				phys_end - phys_start);
+			phys_start = phys_end = 0;
+		}
+
 		switch (vtable_type(vt)) {
+		case VSWAP_SWAPFILE:
+			if (!phys_start) {
+				phys = vtable_to_phys(vt);
+				phys_start = swp_offset(phys);
+				phys_end = phys_start + 1;
+				phys_type = swp_type(phys);
+			} else {
+				phys_end++;
+			}
+			break;
 		case VSWAP_ZSWAP:
 			if (vtable_to_zswap(vt))
 				zswap_entry_free(vtable_to_zswap(vt));
 			break;
-		case VSWAP_SWAPFILE:
 		case VSWAP_FOLIO:
 		case VSWAP_ZERO:
 		case VSWAP_NONE:
@@ -1998,6 +2156,15 @@ void vswap_release_backing(struct swap_cluster_info *ci,
 
 		__vtable_set(ci_dyn, ci_off, vtable_mk_none());
 	}
+
+	if (phys_start != phys_end) {
+		psi = __swap_type_to_info(phys_type);
+		__swap_cluster_free_phys_backing(psi,
+			__swap_entry_to_cluster(
+				swp_entry(phys_type, phys_start)),
+			phys_start % SWAPFILE_CLUSTER,
+			phys_end - phys_start);
+	}
 }
 
 void vswap_store_folio(swp_entry_t entry, struct folio *folio)
@@ -2050,6 +2217,54 @@ void vswap_prepare_writeout(swp_entry_t entry, struct folio *folio)
 	spin_unlock(&ci->lock);
 }
 
+swp_entry_t folio_realloc_swap(struct folio *folio)
+{
+	swp_entry_t vswap_entry = folio->swap;
+	struct swap_cluster_info *ci;
+	struct swap_cluster_info_dynamic *ci_dyn;
+	unsigned int voff;
+	swp_entry_t phys_entry = {};
+	swp_entry_t pe;
+	int i, nr = folio_nr_pages(folio);
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+	VM_WARN_ON(!swap_is_vswap(__swap_entry_to_info(vswap_entry)));
+
+	phys_entry = vswap_to_phys(vswap_entry);
+	if (phys_entry.val)
+		return phys_entry;
+
+	local_lock(&percpu_swap_cluster.lock);
+	phys_entry = swap_alloc_fast(folio);
+	if (!phys_entry.val)
+		phys_entry = swap_alloc_slow(folio);
+	local_unlock(&percpu_swap_cluster.lock);
+
+	if (!phys_entry.val)
+		return (swp_entry_t){};
+
+	voff = swp_cluster_offset(vswap_entry);
+
+	ci = __swap_entry_to_cluster(vswap_entry);
+	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	spin_lock(&ci->lock);
+	/*
+	 * Install PHYS backing without freeing any prior contents of the
+	 * vtable. The caller is responsible for any cleanup of the prior
+	 * backing — for example, zswap_writeback_entry calls in with the
+	 * slot still pointing at the loaded zswap_entry (which it uses
+	 * for decompress before zswap_entry_free), and swap_writeout
+	 * calls vswap_prepare_writeout first to drop partial ZSWAP state.
+	 */
+	for (i = 0; i < nr; i++) {
+		pe.val = phys_entry.val + i;
+		__vtable_set(ci_dyn, voff + i, vtable_mk_phys(pe));
+	}
+	spin_unlock(&ci->lock);
+
+	return phys_entry;
+}
 #endif /* CONFIG_VSWAP */
 
 /**
@@ -2181,6 +2396,70 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
  * Free a set of swap slots after their swap count dropped to zero, or will be
  * zero after putting the last ref (saves one __swap_cluster_put_entry call).
  */
+#ifdef CONFIG_VSWAP
+/*
+ * Clear swap table entries to NULL and reset zero flags.
+ * Does not touch memcg or count — caller handles those.
+ */
+static void __swap_cluster_clear_table(struct swap_cluster_info *ci,
+				       unsigned int ci_start,
+				       unsigned int nr_pages)
+{
+	unsigned int ci_off;
+
+	lockdep_assert_held(&ci->lock);
+	for (ci_off = ci_start; ci_off < ci_start + nr_pages; ci_off++) {
+		__swap_table_set(ci, ci_off, null_to_swp_tb());
+		if (!SWAP_TABLE_HAS_ZEROFLAG)
+			__swap_table_clear_zero(ci, ci_off);
+	}
+}
+#endif
+
+/*
+ * Common tail for freeing swap slots: device-level accounting
+ * and cluster list management.
+ */
+static void __swap_cluster_finish_free(struct swap_info_struct *si,
+				       struct swap_cluster_info *ci,
+				       unsigned int ci_start,
+				       unsigned int nr_pages)
+{
+	lockdep_assert_held(&ci->lock);
+	swap_range_free(si, cluster_offset(si, ci) + ci_start, nr_pages);
+	swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
+
+	if (!ci->count)
+		free_cluster(si, ci);
+	else
+		partial_free_cluster(si, ci);
+}
+
+#ifdef CONFIG_VSWAP
+/*
+ * Free physical swap slots that were backing vswap entries (Pointer-tagged).
+ * Clears the physical swap table, decrements cluster count, and does
+ * device-level accounting. Called from vswap_release_backing.
+ */
+static void __swap_cluster_free_phys_backing(struct swap_info_struct *psi,
+					     struct swap_cluster_info *pci,
+					     unsigned int ci_start,
+					     unsigned int nr_pages)
+{
+	/*
+	 * Caller holds the vswap cluster lock (asserted in
+	 * vswap_release_backing). Nest the physical cluster lock under it
+	 * — same lockdep class, so use SINGLE_DEPTH_NESTING to silence
+	 * PROVE_LOCKING.
+	 */
+	spin_lock_nested(&pci->lock, SINGLE_DEPTH_NESTING);
+	VM_WARN_ON(pci->count < nr_pages);
+	pci->count -= nr_pages;
+	__swap_cluster_clear_table(pci, ci_start, nr_pages);
+	__swap_cluster_finish_free(psi, pci, ci_start, nr_pages);
+	swap_cluster_unlock(pci);
+}
+#endif
 void __swap_cluster_free_entries(struct swap_info_struct *si,
 				 struct swap_cluster_info *ci,
 				 unsigned int ci_start, unsigned int nr_pages)
@@ -2188,7 +2467,6 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 	unsigned long old_tb;
 	unsigned short batch_id = 0, id_cur;
 	unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
-	unsigned long ci_head = cluster_offset(si, ci);
 	unsigned int batch_off = ci_off;
 
 	VM_WARN_ON(ci->count < nr_pages);
@@ -2226,13 +2504,7 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 	if (batch_id)
 		mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
 
-	swap_range_free(si, ci_head + ci_start, nr_pages);
-	swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
-
-	if (!ci->count)
-		free_cluster(si, ci);
-	else
-		partial_free_cluster(si, ci);
+	__swap_cluster_finish_free(si, ci, ci_start, nr_pages);
 }
 
 int __swap_count(swp_entry_t entry)
@@ -3070,19 +3342,85 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 
 static int try_to_unuse(unsigned int type)
 {
+	struct swap_cluster_info *vci;
+	struct mempolicy mpol = { .mode = MPOL_DEFAULT };
 	struct mm_struct *prev_mm;
 	struct mm_struct *mm;
 	struct list_head *p;
 	int retval = 0;
 	struct swap_info_struct *si = swap_info[type];
 	struct folio *folio;
-	swp_entry_t entry;
-	unsigned int i;
+	swp_entry_t entry, vswap_entry;
+	unsigned long swp_tb;
+	unsigned int i, ci_off;
 
 	if (!swap_usage_in_pages(si))
 		goto success;
 
 retry:
+	/*
+	 * Free vswap-backing slots (Pointer-tagged) first. Walk physical
+	 * clusters, read the vswap entry from the rmap, ensure the data
+	 * is in the swap cache, and transition PHYS→FOLIO. No page table
+	 * walk needed — just free the physical backing.
+	 */
+	i = 0;
+	while (IS_ENABLED(CONFIG_VSWAP) &&
+	       swap_usage_in_pages(si) &&
+	       !signal_pending(current) &&
+	       (i = find_next_to_unuse(si, i)) != 0) {
+		swp_entry_t phys;
+
+		vci = __swap_offset_to_cluster(si, i);
+		if (!vci)
+			continue;
+		ci_off = i % SWAPFILE_CLUSTER;
+
+		spin_lock(&vci->lock);
+		swp_tb = __swap_table_get(vci, ci_off);
+		spin_unlock(&vci->lock);
+
+		if (!swp_tb_is_pointer(swp_tb))
+			continue;
+
+		vswap_entry = swp_tb_ptr_to_swp_entry(swp_tb);
+
+		folio = swap_cache_get_folio(vswap_entry);
+		if (!folio) {
+			folio = swap_cache_alloc_folio(vswap_entry,
+						      GFP_KERNEL, BIT(0), NULL,
+						      &mpol, NO_INTERLEAVE_INDEX);
+			if (IS_ERR_OR_NULL(folio))
+				continue;
+			swap_read_folio(folio, NULL);
+			folio_lock(folio);
+		} else {
+			folio_lock(folio);
+		}
+
+		if (!folio_matches_swap_entry(folio, vswap_entry)) {
+			folio_unlock(folio);
+			folio_put(folio);
+			continue;
+		}
+
+		phys = vswap_to_phys(vswap_entry);
+		if (!phys.val || swp_type(phys) != type) {
+			folio_unlock(folio);
+			folio_put(folio);
+			continue;
+		}
+
+		folio_wait_writeback(folio);
+		vswap_store_folio(vswap_entry, folio);
+		folio_mark_dirty(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	if (!swap_usage_in_pages(si))
+		goto success;
+
 	retval = shmem_unuse(type);
 	if (retval)
 		return retval;
@@ -3126,6 +3464,14 @@ static int try_to_unuse(unsigned int type)
 
 		entry = swp_entry(type, i);
 
+		if (IS_ENABLED(CONFIG_VSWAP)) {
+			swp_tb = swap_table_get(
+				__swap_offset_to_cluster(si, i),
+				i % SWAPFILE_CLUSTER);
+			if (swp_tb_is_pointer(swp_tb))
+				continue;
+		}
+
 		folio = swap_cache_get_folio(entry);
 		if (!folio)
 			continue;
diff --git a/mm/vswap.h b/mm/vswap.h
index 5e6e5b88593c..a3a84e27f819 100644
--- a/mm/vswap.h
+++ b/mm/vswap.h
@@ -24,6 +24,40 @@ static inline bool swap_is_vswap(struct swap_info_struct *si)
 
 extern struct swap_info_struct *vswap_si;
 
+/* Rmap cache-only helpers for physical cluster Pointer-tagged entries */
+
+static inline void swap_rmap_mark_cache_only(struct swap_cluster_info *ci,
+					     unsigned int off)
+{
+	atomic_long_t *table;
+
+	table = rcu_dereference_check(ci->table, true);
+	atomic_long_or(SWP_RMAP_CACHE_ONLY, &table[off]);
+}
+
+static inline void swap_rmap_clear_cache_only(struct swap_cluster_info *ci,
+					      unsigned int off)
+{
+	atomic_long_t *table;
+
+	table = rcu_dereference_check(ci->table, true);
+	atomic_long_and(~SWP_RMAP_CACHE_ONLY, &table[off]);
+}
+
+static inline bool swap_rmap_is_cache_only(struct swap_cluster_info *ci,
+					   unsigned int off)
+{
+	atomic_long_t *table;
+	bool ret;
+
+	VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+	rcu_read_lock();
+	table = rcu_dereference(ci->table);
+	ret = table && (atomic_long_read(&table[off]) & SWP_RMAP_CACHE_ONLY);
+	rcu_read_unlock();
+	return ret;
+}
+
 /*
  * Virtual table entry encoding for vswap clusters.
  *
@@ -73,6 +107,20 @@ static inline unsigned long vtable_mk_none(void)
 	return 0;
 }
 
+static inline unsigned long vtable_mk_phys(swp_entry_t entry)
+{
+	return vtable_mk(VSWAP_SWAPFILE, entry.val);
+}
+
+static inline swp_entry_t vtable_to_phys(unsigned long vt)
+{
+	swp_entry_t entry;
+
+	VM_WARN_ON(vtable_type(vt) != VSWAP_SWAPFILE);
+	entry.val = vtable_payload(vt);
+	return entry;
+}
+
 static inline unsigned long vtable_mk_zero(void)
 {
 	return VSWAP_ZERO;
@@ -136,6 +184,27 @@ vswap_lock_cluster(swp_entry_t entry, unsigned int *voff)
 	return ci_dyn;
 }
 
+/* High-level vswap lookup */
+
+static inline swp_entry_t vswap_to_phys(swp_entry_t entry)
+{
+	struct swap_cluster_info_dynamic *ci_dyn;
+	unsigned int voff;
+	unsigned long vt;
+
+	ci_dyn = vswap_lock_cluster(entry, &voff);
+	if (!ci_dyn)
+		return (swp_entry_t){};
+
+	vt = __vtable_get(ci_dyn, voff);
+	spin_unlock(&ci_dyn->ci.lock);
+
+	if (vtable_type(vt) != VSWAP_SWAPFILE)
+		return (swp_entry_t){};
+
+	return vtable_to_phys(vt);
+}
+
 /* Zswap entry helpers — store/load/erase in virtual_table */
 
 void vswap_release_backing(struct swap_cluster_info *ci,
@@ -188,6 +257,7 @@ static inline int vswap_check_backing(swp_entry_t entry, int nr,
 	enum vswap_backing_type first_type;
 	unsigned int voff;
 	unsigned long vt;
+	swp_entry_t first_phys;
 	int i;
 
 	ci_dyn = vswap_lock_cluster(entry, &voff);
@@ -196,10 +266,16 @@ static inline int vswap_check_backing(swp_entry_t entry, int nr,
 
 	for (i = 0; i < nr; i++) {
 		vt = __vtable_get(ci_dyn, voff + i);
-		if (!i)
+		if (!i) {
 			first_type = vtable_type(vt);
-		else if (vtable_type(vt) != first_type)
+			if (first_type == VSWAP_SWAPFILE)
+				first_phys = vtable_to_phys(vt);
+		} else if (vtable_type(vt) != first_type) {
 			break;
+		} else if (first_type == VSWAP_SWAPFILE &&
+			   vtable_to_phys(vt).val != first_phys.val + i) {
+			break;
+		}
 	}
 	spin_unlock(&ci_dyn->ci.lock);
 
@@ -208,12 +284,20 @@ static inline int vswap_check_backing(swp_entry_t entry, int nr,
 	return i;
 }
 
+static inline bool vswap_swapfile_backed(swp_entry_t entry, int nr)
+{
+	enum vswap_backing_type type;
+
+	return vswap_check_backing(entry, nr, &type) == nr &&
+	       type == VSWAP_SWAPFILE;
+}
+
 static inline bool vswap_can_swapin_thp(swp_entry_t entry, int nr)
 {
 	enum vswap_backing_type type;
 
 	return vswap_check_backing(entry, nr, &type) == nr &&
-	       type == VSWAP_ZERO;
+	       (type == VSWAP_ZERO || type == VSWAP_SWAPFILE);
 }
 
 static inline int vswap_cluster_alloc_vtable(struct swap_cluster_info_dynamic *ci_dyn)
@@ -266,6 +350,22 @@ static inline void vswap_set_zero(struct swap_cluster_info *ci,
 
 #else /* !CONFIG_VSWAP */
 
+static inline swp_entry_t vswap_to_phys(swp_entry_t entry)
+{
+	return (swp_entry_t){};
+}
+
+static inline bool vswap_swapfile_backed(swp_entry_t entry, int nr)
+{
+	return false;
+}
+
+static inline bool swap_rmap_is_cache_only(struct swap_cluster_info *ci,
+					   unsigned int off)
+{
+	return false;
+}
+
 static inline void vswap_release_backing(struct swap_cluster_info *ci,
 					 unsigned int ci_start,
 					 unsigned int nr) {}
@@ -310,4 +410,36 @@ static inline void vswap_set_zero(struct swap_cluster_info *ci,
 				  unsigned int ci_off) {}
 
 #endif /* CONFIG_VSWAP */
+
+/*
+ * Test a per-backend swap flag (SWP_SYNCHRONOUS_IO, SWP_STABLE_WRITES, ...)
+ * for @entry. For a vswap entry the property belongs to the current
+ * physical backing, not vswap_si — resolve and test that. Returns false
+ * for zswap/zero/unbacked vswap entries: they don't go through bdev IO,
+ * so per-bdev flags don't apply.
+ */
+static inline bool swap_entry_backend_has_flag(struct swap_info_struct *si,
+					       swp_entry_t entry,
+					       unsigned long flag)
+{
+	struct swap_info_struct *phys_si;
+	swp_entry_t phys;
+	bool has_flag;
+
+	if (!swap_is_vswap(si))
+		return data_race(si->flags & flag);
+
+	phys = vswap_to_phys(entry);
+	if (!phys.val)
+		return false;
+
+	phys_si = get_swap_device(phys);
+	if (!phys_si)
+		return false;
+
+	has_flag = data_race(phys_si->flags & flag);
+	put_swap_device(phys_si);
+	return has_flag;
+}
+
 #endif /* _MM_VSWAP_H */
diff --git a/mm/zswap.c b/mm/zswap.c
index c57bf0246bb2..85622af0df5c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -993,6 +993,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct folio *folio;
 	struct mempolicy *mpol;
 	struct swap_info_struct *si;
+	swp_entry_t phys = {};
 	int ret = 0;
 
 	/* try to allocate swap cache folio */
@@ -1000,16 +1001,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	if (!si)
 		return -EEXIST;
 
-	/*
-	 * Vswap entries have no physical backing — writeback would fail
-	 * and SIGBUS the caller. Bail before we waste a swap-cache folio
-	 * allocation.
-	 */
-	if (si->flags & SWP_VSWAP) {
-		put_swap_device(si);
-		return -EINVAL;
-	}
-
 	mpol = get_task_policy(current);
 	folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
 				       NO_INTERLEAVE_INDEX);
@@ -1028,31 +1019,57 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/*
 	 * folio is locked, and the swapcache is now secured against
 	 * concurrent swapping to and from the slot, and concurrent
-	 * swapoff so we can safely dereference the zswap tree here.
-	 * Verify that the swap entry hasn't been invalidated and recycled
-	 * behind our backs, to avoid overwriting a new swap folio with
-	 * old compressed data. Only when this is successful can the entry
-	 * be dereferenced.
+	 * swapoff so we can safely dereference the zswap tree (or vswap
+	 * vtable) here. Verify that the swap entry hasn't been
+	 * invalidated and recycled behind our backs, to avoid overwriting
+	 * a new swap folio with old compressed data. Only when this is
+	 * successful can the entry be dereferenced.
 	 */
-	tree = swap_zswap_tree(swpentry);
-	if (entry != xa_load(tree, offset)) {
-		ret = -ENOMEM;
-		goto out;
+	if (swap_is_vswap(si)) {
+		if (entry != vswap_zswap_load(swpentry)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * Allocate physical backing BEFORE decompressing, so a failed
+		 * allocation wastes no decompression work. folio_realloc_swap
+		 * sets the vtable slot to PHYS, overwriting ZSWAP — from here
+		 * on the old entry pointer is held only by the caller.
+		 */
+		phys = folio_realloc_swap(folio);
+		if (!phys.val) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else {
+		tree = swap_zswap_tree(swpentry);
+		if (entry != xa_load(tree, offset)) {
+			ret = -ENOMEM;
+			goto out;
+		}
 	}
 
 	if (!zswap_decompress(entry, folio)) {
 		ret = -EIO;
+		/*
+		 * For vswap: folio_realloc_swap already moved the entry
+		 * out of the vtable. Restore it via vswap_zswap_store so
+		 * the entry stays tracked (and the just-allocated PHYS
+		 * slot is freed). For non-vswap: entry is still in the
+		 * zswap tree.
+		 */
+		if (swap_is_vswap(si) && phys.val)
+			vswap_zswap_store(swpentry, entry);
 		goto out;
 	}
 
-	xa_erase(tree, offset);
+	if (!swap_is_vswap(si))
+		xa_erase(tree, offset);
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
 		count_objcg_events(entry->objcg, ZSWPWB, 1);
 
-	zswap_entry_free(entry);
-
 	/* folio is up to date */
 	folio_mark_uptodate(folio);
 
@@ -1060,8 +1077,22 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	folio_set_reclaim(folio);
 
 	/* start writeback */
-	ret = __swap_writepage(folio, NULL);
-	WARN_ON_ONCE(ret);
+	if (swap_is_vswap(si)) {
+		ret = __swap_writepage_phys(folio, NULL, phys);
+		WARN_ON_ONCE(ret);
+	} else {
+		ret = __swap_writepage(folio, NULL);
+		WARN_ON_ONCE(ret);
+	}
+
+	/*
+	 * __swap_writepage{,_phys} always returns 0 today — async IO
+	 * errors surface in the bio end_io callback, not synchronously
+	 * here. Either way, the entry is no longer tracked at its prior
+	 * location (vtable slot now holds PHYS for vswap; erased from
+	 * the zswap tree for non-vswap), so we own the free.
+	 */
+	zswap_entry_free(entry);
 
 out:
 	if (ret) {
-- 
2.53.0-Meta


^ permalink raw reply related

* [RFC PATCH 4/5] mm, swap: only charge physical swap entries
From: Nhat Pham @ 2026-05-28 21:29 UTC (permalink / raw)
  To: kasong
  Cc: Liam.Howlett, akpm, apopple, axelrasmussen, baohua, baolin.wang,
	bhe, byungchul, cgroups, chengming.zhou, chrisl, corbet, david,
	dev.jain, gourry, hannes, hughd, jannh, joshua.hahnjy, lance.yang,
	lenb, linux-doc, linux-kernel, linux-mm, linux-pm,
	lorenzo.stoakes, matthew.brost, mhocko, muchun.song, npache,
	nphamcs, pavel, peterx, peterz, pfalcato, rafael, rakie.kim,
	roman.gushchin, rppt, ryan.roberts, shakeel.butt, shikemeng,
	surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed, yuanchu,
	zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>

Stop double-charging vswap entries against memcg->swap. Previously,
the entry was charged once at vswap allocation (via
mem_cgroup_try_charge_swap) and implicitly again when physical
backing was allocated.

Split the lifecycle into four operations: (1) record the memcg
private ID at vswap alloc without charging; (2) charge memcg->swap
only when physical backing is allocated via folio_realloc_swap;
(3) uncharge in vswap_release_backing (only the nr_swapfile entries
on cgroup v2, all nr entries on cgroup v1 with memsw accounting);
and (4) drop the ID ref at __swap_cluster_free_entries without
uncharging.

Direct-mapped physical swap charging is unchanged.

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h |  57 +++++++++++++++++++++
 mm/memcontrol.c      | 118 +++++++++++++++++++++++++++++++++++++++++++
 mm/swapfile.c        | 109 ++++++++++++++++++++++++++++++++++++---
 3 files changed, 276 insertions(+), 8 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3fb55485fc76..6f18ecdf0bb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -597,6 +597,43 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
 	return __mem_cgroup_try_charge_swap(folio);
 }
 
+extern void __mem_cgroup_record_swap(struct folio *folio);
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{
+	if (mem_cgroup_disabled())
+		return;
+	__mem_cgroup_record_swap(folio);
+}
+
+extern int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+					 unsigned int nr_pages);
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+					      unsigned int nr_pages)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+	return __mem_cgroup_charge_backing_phys_swap(memcg, nr_pages);
+}
+
+extern void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+					    unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+						 unsigned int nr_pages)
+{
+	if (mem_cgroup_disabled())
+		return;
+	__mem_cgroup_uncharge_backing_phys_swap(memcg, nr_pages);
+}
+
+extern void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+					  unsigned int nr_pages)
+{
+	if (mem_cgroup_disabled())
+		return;
+	__mem_cgroup_id_put_swap(id, nr_pages);
+}
+
 extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
 static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
 {
@@ -613,6 +650,26 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
 	return 0;
 }
 
+static inline void mem_cgroup_record_swap(struct folio *folio)
+{
+}
+
+static inline int mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+					      unsigned int nr_pages)
+{
+	return 0;
+}
+
+static inline void mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+						 unsigned int nr_pages)
+{
+}
+
+static inline void mem_cgroup_id_put_swap(unsigned short id,
+					  unsigned int nr_pages)
+{
+}
+
 static inline void mem_cgroup_uncharge_swap(unsigned short id,
 					    unsigned int nr_pages)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7492879b3239..91618da7ec20 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5513,6 +5513,124 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
 	return 0;
 }
 
+/**
+ * __mem_cgroup_record_swap - record memcg for swap without charging
+ * @folio: folio being added to swap
+ *
+ * Pin the memcg private ID ref and record it in the swap cgroup table,
+ * but do not charge memcg->swap. Used for vswap entries where the charge
+ * is deferred until physical backing is allocated.
+ */
+void __mem_cgroup_record_swap(struct folio *folio)
+{
+	unsigned int nr_pages = folio_nr_pages(folio);
+	struct swap_cluster_info *ci;
+	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
+
+	if (do_memsw_account())
+		return;
+
+	objcg = folio_objcg(folio);
+	if (!objcg)
+		return;
+
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	if (!folio_test_swapcache(folio)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	memcg = mem_cgroup_private_id_get_online(memcg, nr_pages);
+	rcu_read_unlock();
+
+	ci = swap_cluster_get_and_lock(folio);
+	__swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+			  mem_cgroup_private_id(memcg));
+	swap_cluster_unlock(ci);
+}
+
+/**
+ * __mem_cgroup_charge_backing_phys_swap - charge memcg->swap counter only
+ * @memcg: the mem_cgroup to charge (may be NULL)
+ * @nr_pages: number of physical swap pages to charge
+ *
+ * Unlike __mem_cgroup_try_charge_swap(), this does NOT touch the memcg
+ * private ID refcount — the ID ref was pinned earlier by
+ * __mem_cgroup_record_swap() at vswap allocation time and lives for the
+ * lifetime of the vswap entry. This helper only updates the swap counter
+ * when a vswap entry transitions to physical backing (folio_realloc_swap),
+ * so the counter and the ID ref can be managed independently.
+ *
+ * The caller resolves the memcg (typically via folio_memcg + ID
+ * comparison to avoid IDR lookups on the hot path).
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int __mem_cgroup_charge_backing_phys_swap(struct mem_cgroup *memcg,
+				  unsigned int nr_pages)
+{
+	struct page_counter *counter;
+
+	if (do_memsw_account())
+		return 0;
+	if (!memcg)
+		return 0;
+
+	if (!mem_cgroup_is_root(memcg) &&
+	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+		return -ENOMEM;
+	}
+	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+	return 0;
+}
+
+/**
+ * __mem_cgroup_uncharge_backing_phys_swap - uncharge memcg->swap counter only
+ * @memcg: the mem_cgroup to uncharge (may be NULL)
+ * @nr_pages: number of physical swap pages to uncharge
+ *
+ * Unlike __mem_cgroup_uncharge_swap(), this does NOT drop the memcg
+ * private ID refcount — that ref is dropped separately via
+ * __mem_cgroup_id_put_swap() when the vswap entry itself is freed.
+ * This helper only updates the swap counter when physical backing is
+ * released (vswap_release_backing), so the counter and ID ref can be
+ * managed independently.
+ */
+void __mem_cgroup_uncharge_backing_phys_swap(struct mem_cgroup *memcg,
+				     unsigned int nr_pages)
+{
+	if (!memcg)
+		return;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (do_memsw_account())
+			page_counter_uncharge(&memcg->memsw, nr_pages);
+		else
+			page_counter_uncharge(&memcg->swap, nr_pages);
+	}
+	mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
+}
+
+/**
+ * __mem_cgroup_id_put_swap - drop memcg private ID ref without uncharging
+ * @id: cgroup private id
+ * @nr_pages: number of refs to drop
+ */
+void __mem_cgroup_id_put_swap(unsigned short id, unsigned int nr_pages)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_private_id(id);
+	if (memcg)
+		mem_cgroup_private_id_put(memcg, nr_pages);
+	rcu_read_unlock();
+}
+
 /**
  * __mem_cgroup_uncharge_swap - uncharge swap space
  * @id: cgroup id to uncharge
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a0976be6a12b..be901fb741e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
 #include <linux/capability.h>
 #include <linux/syscalls.h>
 #include <linux/memcontrol.h>
+#include "memcontrol-v1.h"
 #include <linux/poll.h>
 #include <linux/oom.h>
 #include <linux/swapfile.h>
@@ -2043,8 +2044,15 @@ int folio_alloc_swap(struct folio *folio)
 			goto again;
 	}
 
-	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
-	if (unlikely(mem_cgroup_try_charge_swap(folio)))
+	/*
+	 * Vswap entries: record memcg ID without charging — the charge is
+	 * deferred to folio_realloc_swap when physical backing is allocated.
+	 * Direct-mapped physical swap entries: charge immediately as today.
+	 */
+	if (folio_test_swapcache(folio) &&
+	    swap_is_vswap(__swap_entry_to_info(folio->swap)))
+		mem_cgroup_record_swap(folio);
+	else if (unlikely(mem_cgroup_try_charge_swap(folio)))
 		swap_cache_del_folio(folio);
 
 	if (unlikely(!folio_test_swapcache(folio)))
@@ -2096,6 +2104,26 @@ static void __swap_cluster_free_phys_backing(struct swap_info_struct *psi,
 					     unsigned int ci_start,
 					     unsigned int nr_pages);
 
+static void vswap_uncharge_cgroup_batch(unsigned short memcg_id,
+					unsigned int batch_nr,
+					unsigned int batch_nr_swapfile)
+{
+	struct mem_cgroup *memcg;
+	unsigned int n;
+
+	if (do_memsw_account())
+		n = batch_nr;
+	else
+		n = batch_nr_swapfile;
+	if (!n)
+		return;
+
+	rcu_read_lock();
+	memcg = memcg_id ? mem_cgroup_from_private_id(memcg_id) : NULL;
+	rcu_read_unlock();
+	mem_cgroup_uncharge_backing_phys_swap(memcg, n);
+}
+
 void vswap_release_backing(struct swap_cluster_info *ci,
 			   unsigned int ci_start, unsigned int nr)
 {
@@ -2106,12 +2134,36 @@ void vswap_release_backing(struct swap_cluster_info *ci,
 	unsigned int ci_off;
 	unsigned long vt;
 	swp_entry_t phys;
+	/*
+	 * Per-cgroup uncharge batching: a single vswap_release_backing
+	 * call can span multiple cgroups (e.g. batched free across
+	 * folios), so we cannot uncharge with the first slot's memcg
+	 * for the whole range.
+	 */
+	unsigned short batch_id;
+	unsigned int batch_nr = 0, batch_nr_swapfile = 0;
 
 	lockdep_assert_held(&ci->lock);
 	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+	batch_id = __swap_cgroup_get(ci, ci_start);
 
 	for (ci_off = ci_start; ci_off < ci_start + nr; ci_off++) {
+		unsigned short cur_id;
+
 		vt = __vtable_get(ci_dyn, ci_off);
+		cur_id = __swap_cgroup_get(ci, ci_off);
+
+		/*
+		 * Flush per-cgroup uncharge when crossing a cgroup boundary.
+		 */
+		if (cur_id != batch_id) {
+			vswap_uncharge_cgroup_batch(batch_id, batch_nr,
+						    batch_nr_swapfile);
+			batch_id = cur_id;
+			batch_nr = 0;
+			batch_nr_swapfile = 0;
+		}
+		batch_nr++;
 
 		/*
 		 * Flush batched physical slots when the next entry
@@ -2135,6 +2187,7 @@ void vswap_release_backing(struct swap_cluster_info *ci,
 
 		switch (vtable_type(vt)) {
 		case VSWAP_SWAPFILE:
+			batch_nr_swapfile++;
 			if (!phys_start) {
 				phys = vtable_to_phys(vt);
 				phys_start = swp_offset(phys);
@@ -2165,6 +2218,9 @@ void vswap_release_backing(struct swap_cluster_info *ci,
 			phys_start % SWAPFILE_CLUSTER,
 			phys_end - phys_start);
 	}
+
+	/* Final cgroup-batch flush. */
+	vswap_uncharge_cgroup_batch(batch_id, batch_nr, batch_nr_swapfile);
 }
 
 void vswap_store_folio(swp_entry_t entry, struct folio *folio)
@@ -2222,7 +2278,9 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
 	swp_entry_t vswap_entry = folio->swap;
 	struct swap_cluster_info *ci;
 	struct swap_cluster_info_dynamic *ci_dyn;
+	struct mem_cgroup *memcg;
 	unsigned int voff;
+	unsigned short memcg_id;
 	swp_entry_t phys_entry = {};
 	swp_entry_t pe;
 	int i, nr = folio_nr_pages(folio);
@@ -2245,9 +2303,33 @@ swp_entry_t folio_realloc_swap(struct folio *folio)
 		return (swp_entry_t){};
 
 	voff = swp_cluster_offset(vswap_entry);
-
 	ci = __swap_entry_to_cluster(vswap_entry);
 	ci_dyn = container_of(ci, struct swap_cluster_info_dynamic, ci);
+
+	/*
+	 * Resolve the memcg for physical swap charging. Compare
+	 * folio_memcg against the recorded swap memcg ID — on match
+	 * (common case), zero IDR lookups. Only fall back to IDR
+	 * lookup on mismatch (task migrated cgroups).
+	 */
+	spin_lock(&ci->lock);
+	memcg_id = __swap_cgroup_get(ci, voff);
+	spin_unlock(&ci->lock);
+
+	rcu_read_lock();
+	memcg = folio_memcg(folio);
+	if (!memcg || mem_cgroup_private_id(memcg) != memcg_id)
+		memcg = memcg_id ? mem_cgroup_from_private_id(memcg_id) : NULL;
+	rcu_read_unlock();
+
+	if (mem_cgroup_charge_backing_phys_swap(memcg, nr)) {
+		__swap_cluster_free_phys_backing(
+			__swap_entry_to_info(phys_entry),
+			__swap_entry_to_cluster(phys_entry),
+			swp_cluster_offset(phys_entry), nr);
+		return (swp_entry_t){};
+	}
+
 	spin_lock(&ci->lock);
 	/*
 	 * Install PHYS backing without freeing any prior contents of the
@@ -2468,10 +2550,11 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 	unsigned short batch_id = 0, id_cur;
 	unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
 	unsigned int batch_off = ci_off;
+	bool is_vswap = swap_is_vswap(si);
 
 	VM_WARN_ON(ci->count < nr_pages);
 
-	if (swap_is_vswap(si))
+	if (is_vswap)
 		vswap_release_backing(ci, ci_start, nr_pages);
 
 	ci->count -= nr_pages;
@@ -2491,18 +2574,28 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 		/*
 		 * Uncharge swap slots by memcg in batches. Consecutive
 		 * slots with the same cgroup id are uncharged together.
+		 * For vswap, only drop the ID ref — physical swap was
+		 * already uncharged in vswap_release_backing above.
 		 */
 		id_cur = __swap_cgroup_clear(ci, ci_off, 1);
 		if (batch_id != id_cur) {
-			if (batch_id)
-				mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+			if (batch_id) {
+				if (is_vswap)
+					mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+				else
+					mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+			}
 			batch_id = id_cur;
 			batch_off = ci_off;
 		}
 	} while (++ci_off < ci_end);
 
-	if (batch_id)
-		mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+	if (batch_id) {
+		if (is_vswap)
+			mem_cgroup_id_put_swap(batch_id, ci_off - batch_off);
+		else
+			mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+	}
 
 	__swap_cluster_finish_free(si, ci, ci_start, nr_pages);
 }
-- 
2.53.0-Meta


^ permalink raw reply related

* [RFC PATCH 5/5] mm, swap: add debugfs counters for vswap
From: Nhat Pham @ 2026-05-28 21:29 UTC (permalink / raw)
  To: kasong
  Cc: Liam.Howlett, akpm, apopple, axelrasmussen, baohua, baolin.wang,
	bhe, byungchul, cgroups, chengming.zhou, chrisl, corbet, david,
	dev.jain, gourry, hannes, hughd, jannh, joshua.hahnjy, lance.yang,
	lenb, linux-doc, linux-kernel, linux-mm, linux-pm,
	lorenzo.stoakes, matthew.brost, mhocko, muchun.song, npache,
	nphamcs, pavel, peterx, peterz, pfalcato, rafael, rakie.kim,
	roman.gushchin, rppt, ryan.roberts, shakeel.butt, shikemeng,
	surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed, yuanchu,
	zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <20260528212955.1912856-1-nphamcs@gmail.com>

Add /sys/kernel/debug/vswap/ with two counters:

- used: number of virtual swap slots currently allocated
- alloc_reject: cumulative number of virtual swap slots whose allocation failed

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 mm/swapfile.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index be901fb741e5..3740ab764405 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/debugfs.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task.h>
@@ -132,6 +133,9 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
 	.lock = INIT_LOCAL_LOCK(),
 };
 
+static atomic_t __maybe_unused vswap_used = ATOMIC_INIT(0);
+static atomic_t __maybe_unused vswap_alloc_reject = ATOMIC_INIT(0);
+
 #ifdef CONFIG_VSWAP
 struct percpu_vswap_cluster {
 	unsigned long offset[SWAP_NR_ORDERS];
@@ -1993,11 +1997,13 @@ static bool vswap_alloc(struct folio *folio)
 	if (folio_test_swapcache(folio)) {
 		/* alloc_swap_scan_cluster updated percpu offset already */
 		local_unlock(&percpu_vswap_cluster.lock);
+		atomic_add(folio_nr_pages(folio), &vswap_used);
 		return true;
 	}
 
 	this_cpu_write(percpu_vswap_cluster.offset[order], SWAP_ENTRY_INVALID);
 	local_unlock(&percpu_vswap_cluster.lock);
+	atomic_add(folio_nr_pages(folio), &vswap_alloc_reject);
 	return false;
 }
 #endif
@@ -2554,8 +2560,10 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
 
 	VM_WARN_ON(ci->count < nr_pages);
 
-	if (is_vswap)
+	if (is_vswap) {
 		vswap_release_backing(ci, ci_start, nr_pages);
+		atomic_sub(nr_pages, &vswap_used);
+	}
 
 	ci->count -= nr_pages;
 	do {
@@ -4793,6 +4801,7 @@ struct swap_info_struct *vswap_si;
 static int __init vswap_init(void)
 {
 	struct swap_info_struct *si;
+	struct dentry *root;
 	unsigned long maxpages;
 	int err;
 
@@ -4819,6 +4828,11 @@ static int __init vswap_init(void)
 	mutex_unlock(&swapon_mutex);
 
 	vswap_si = si;
+
+	root = debugfs_create_dir("vswap", NULL);
+	debugfs_create_atomic_t("used", 0444, root, &vswap_used);
+	debugfs_create_atomic_t("alloc_reject", 0444, root, &vswap_alloc_reject);
+
 	pr_info("vswap: created virtual swap device (%lu pages)\n", maxpages);
 	return 0;
 
-- 
2.53.0-Meta


^ permalink raw reply related

* Re: [PATCH v6 01/22] mm/swap: decouple swap cache from physical swap infrastructure
From: Nhat Pham @ 2026-05-28 21:42 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: kasong, Liam.Howlett, akpm, apopple, axelrasmussen, baohua,
	baolin.wang, bhe, byungchul, cgroups, chengming.zhou, chrisl,
	corbet, david, dev.jain, gourry, hannes, hughd, jannh,
	joshua.hahnjy, lance.yang, lenb, linux-doc, linux-kernel,
	linux-mm, linux-pm, lorenzo.stoakes, matthew.brost, mhocko,
	muchun.song, npache, pavel, peterx, peterz, pfalcato, rafael,
	rakie.kim, roman.gushchin, rppt, ryan.roberts, shakeel.butt,
	shikemeng, surenb, tglx, vbabka, weixugc, ying.huang, yosry.ahmed,
	yuanchu, zhengqi.arch, ziy, kernel-team, riel, haowenchao22
In-Reply-To: <agJcCZuLqWwU_sSR@google.com>

On Mon, May 11, 2026 at 3:46 PM Yosry Ahmed <yosry@kernel.org> wrote:
>
> On Tue, May 05, 2026 at 08:38:30AM -0700, Nhat Pham wrote:
> > When we virtualize the swap space, we will manage swap cache at the
> > virtual swap layer. To prepare for this, decouple swap cache from
> > physical swap infrastructure.
> >
> > We will also remove all the swap cache related helpers of swap table. We
> > will keep the rest of the swap table infrastructure, which will be
> > repurposed to serve as the rmap (physical -> virtual swap mapping)
> > later.
>
> I didn't look through the entire series, but let me ask the same
> high-level question I asked before. Instead of moving things out of the
> swap table, why not reuse the swap table as the representation of the
> virtual swap space? Seems like most/all metadata is already moved there
> in a nice concise format.

The honest answer is I wasn't sure it would work, so I was quietly
hacking on a prototype on my own time :)

I finally got something that survives stress-ng and constant
memory.reclaim thrown at it though. I figured I should send it out to
get feedback before digging myself deeper into that hole:

https://lore.kernel.org/all/20260528212955.1912856-1-nphamcs@gmail.com/

There is still a small problem left (the metadata duplication issue
that Johannes brought up). It is potentially fixable, but I haven't
actually tried it out yet, so I don't want to overstate here. But take
a look at it and let me know how you feel about this alternative
approach!

^ permalink raw reply

* [tj-cgroup:for-7.1-fixes] BUILD SUCCESS 645c3b7ef1a7eed9627664bd11d7a8eb4519ee15
From: kernel test robot @ 2026-05-29  1:59 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-7.1-fixes
branch HEAD: 645c3b7ef1a7eed9627664bd11d7a8eb4519ee15  cgroup/cpuset: Add test cases for sibling CPU exclusion on partition update

elapsed time: 1853m

configs tested: 71
configs skipped: 3

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
alpha                   allnoconfig    gcc-15.2.0
alpha                  allyesconfig    gcc-15.2.0
arc                    allmodconfig    gcc-15.2.0
arc                     allnoconfig    gcc-15.2.0
arc                    allyesconfig    gcc-15.2.0
arm                     allnoconfig    clang-23
arm                    allyesconfig    gcc-15.2.0
arm64                  allmodconfig    clang-19
arm64                   allnoconfig    gcc-15.2.0
csky                   allmodconfig    gcc-15.2.0
csky                    allnoconfig    gcc-15.2.0
hexagon                allmodconfig    clang-17
hexagon                 allnoconfig    clang-23
i386                    allnoconfig    gcc-14
i386                   allyesconfig    gcc-14
loongarch              allmodconfig    clang-19
loongarch               allnoconfig    clang-23
m68k                   allmodconfig    gcc-15.2.0
m68k                    allnoconfig    gcc-15.2.0
m68k                   allyesconfig    gcc-15.2.0
microblaze              allnoconfig    gcc-15.2.0
microblaze             allyesconfig    gcc-15.2.0
mips                   allmodconfig    gcc-15.2.0
mips                    allnoconfig    gcc-15.2.0
mips                   allyesconfig    gcc-15.2.0
nios2                  allmodconfig    gcc-11.5.0
nios2                   allnoconfig    gcc-11.5.0
openrisc               allmodconfig    gcc-15.2.0
openrisc                allnoconfig    gcc-15.2.0
openrisc                  defconfig    gcc-15.2.0
parisc                 allmodconfig    gcc-15.2.0
parisc                  allnoconfig    gcc-15.2.0
parisc                 allyesconfig    gcc-15.2.0
parisc                    defconfig    gcc-15.2.0
powerpc                allmodconfig    gcc-15.2.0
powerpc                 allnoconfig    gcc-15.2.0
riscv                  allmodconfig    clang-23
riscv                   allnoconfig    gcc-15.2.0
riscv                  allyesconfig    clang-16
riscv                     defconfig    clang-23
riscv       randconfig-001-20260529    gcc-8.5.0
riscv       randconfig-002-20260529    gcc-9.5.0
s390                   allmodconfig    clang-18
s390                    allnoconfig    clang-23
s390                   allyesconfig    gcc-15.2.0
s390                      defconfig    clang-23
s390        randconfig-001-20260529    gcc-8.5.0
s390        randconfig-002-20260529    clang-23
sh                     allmodconfig    gcc-15.2.0
sh                      allnoconfig    gcc-15.2.0
sh                     allyesconfig    gcc-15.2.0
sh          randconfig-001-20260529    gcc-15.2.0
sh          randconfig-002-20260529    gcc-15.2.0
sparc                   allnoconfig    gcc-15.2.0
sparc                     defconfig    gcc-15.2.0
sparc       randconfig-001-20260529    gcc-8.5.0
sparc       randconfig-002-20260529    gcc-15.2.0
sparc64                allmodconfig    clang-23
sparc64     randconfig-001-20260529    gcc-11.5.0
um                     allmodconfig    clang-19
um                      allnoconfig    clang-23
um                     allyesconfig    gcc-14
um          randconfig-001-20260529    gcc-14
um          randconfig-002-20260529    gcc-14
x86_64                 allmodconfig    clang-20
x86_64                  allnoconfig    clang-20
x86_64                 allyesconfig    clang-20
x86_64                rhel-9.4-rust    clang-20
xtensa                  allnoconfig    gcc-15.2.0
xtensa      randconfig-001-20260529    gcc-11.5.0
xtensa      randconfig-002-20260529    gcc-8.5.0

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH-next v3 3/5] cgroup/cpuset: Made cpuset_attach_old_cs track task group leaders
From: Guopeng Zhang @ 2026-05-29  2:19 UTC (permalink / raw)
  To: Waiman Long, Chen Ridong, Tejun Heo, Johannes Weiner,
	Michal Koutný, Ingo Molnar, Peter Zijlstra
  Cc: cgroups, linux-kernel, Aaron Tomlin, Ridong Chen
In-Reply-To: <20260527153800.1557449-4-longman@redhat.com>



在 2026/5/27 23:37, Waiman Long 写道:
> There are two possible ways that migration of tasks from multiple source
> cpusets to a target cpuset can happen. Either a multithread application
> with threads in different cpusets is wholly moved to a new cpuset
> or disabling of v2 cpuset controller will move all the tasks in child
> cpusets to the parent cpuset.
> 
> In the former case, it is the mm setting of the group leader that really
> matters. So cpuset_attach_old_cs should track the oldcs of the thread
> leader. In the latter case, effective_mems of child cpusets must always
> be a subset of the parent. So no real page migration will be necessary
> no matter which child cpuset is selected as cpuset_attach_old_cs.
> 
> IOW, cpuset_attach_old_cs should be updated to match the latest task
> group leader in cpuset_can_attach().
> 
> Suggested-by: Ridong Chen <ridong.chen@linux.dev>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  kernel/cgroup/cpuset.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 4457c4f11fce..b233a71f9b7c 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -2967,6 +2967,20 @@ static int update_prstate(struct cpuset *cs, int new_prs)
>  /*
>   * cpuset_can_attach() and cpuset_attach() specific internal data
>   * Protected by cpuset_mutex
> + *
> + * The cpuset_attach_old_cs is used mainly by cpuset_migrate_mm() tp get the
> + * old_mems_allowed value. There are two ways that many-to-one cpuset migration
> + * can happen:
Hi Waiman,

I applied this series locally and ran some of my test cases. I didn't
observe any issue so far.

While doing a static/checkpatch pass, I noticed a few minor issues in
patches 3, 4 and 5. They are all non-functional nits.

For this patch, I only noticed a couple of small wording/typo nits in
the new comment:

s/tp get/to get/

Best,
Guopeng
> + * 1) A multithread application with threads in different cpusets is wholely
> + *    moved to a new cpuset.
> + * 2) Disabling v2 cpuset controller will move all the tasks in child cpusets
> + *    to the parent cpuset.
> + *
> + * In the former case, it is the mm setting of the group leader that really
> + * matters. So cpuset_attach_old_cs should track the oldcs of the thread
> + * leader. In the latter case, effective_mems of child cpusets must always
> + * be a subset of the parent. So no real page migration will be necessary no
> + * matter which child cpuset is selected as cpuset_attach_old_cs.
>   */
>  static struct cpuset *cpuset_attach_old_cs;
>  static bool attach_cpus_updated;
> @@ -3069,6 +3083,10 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>  		if (ret)
>  			goto out_unlock;
>  
> +		/* Update cpuset_attach_old_cs to the latest group leader */
> +		if (task == task->group_leader)
> +			cpuset_attach_old_cs = task_cs(task);
> +
>  		if (setsched_check) {
>  			ret = security_task_setscheduler(task);
>  			if (ret)


^ permalink raw reply

* Re: [PATCH-next v3 4/5] cgroup/cpuset: Move mpol_rebind_mm/cpuset_migrate_mm() calls inside cpuset_attach_task()
From: Guopeng Zhang @ 2026-05-29  2:21 UTC (permalink / raw)
  To: Waiman Long, Chen Ridong, Tejun Heo, Johannes Weiner,
	Michal Koutný, Ingo Molnar, Peter Zijlstra
  Cc: cgroups, linux-kernel, Aaron Tomlin
In-Reply-To: <20260527153800.1557449-5-longman@redhat.com>



在 2026/5/27 23:37, Waiman Long 写道:
> The cpuset_attach_task() was introduced in commit 42a11bf5c543
> ("cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly")
> to enable the CLONE_INTO_CGROUP flag of clone(2) to behave more like
> moving a task from one cpuset into another one. That commit didn't
> move the mpol_rebind_mm() and cpuset_migrate_mm() calls for group leader
> into cpuset_attach_task().
> 
> When the CLONE_INTO_CGROUP flag is used without CLONE_THREAD, the new
> task is its own group leader. So it is still not equivalent to moving
> task between cpusets in this case. Make CLONE_INTO_CGROUP behave
> more like cpuset_attach() by moving the mpol_rebind_mm() and
> cpuset_migrate_mm() calls inside cpuset_attach_task(). As a result,
> cpuset_attach_old_cs, attach_cpus_updated and attach_mems_updated will
> also need to be updated in cpuset_fork().
> 
> Besides, the original code uses cpuset_attach_nodemask_to for
> both nodemask returned by guarantee_online_mems() used only by
> cpuset_change_task_nodemask() and cs->effective_mems in all other cases.
> Such dual use is now impractical by merging the two task iteration loops
> into one. So keep cpuset_attach_nodemask_to for the nodemask returned
> by guarantee_online_mems() and reference cs->effective_mems directly
> in all the other cases.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  kernel/cgroup/cpuset.c | 90 ++++++++++++++++++++++--------------------
>  1 file changed, 47 insertions(+), 43 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index b233a71f9b7c..7100575927f6 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -3149,9 +3149,12 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
>   */
>  static cpumask_var_t cpus_attach;
>  static nodemask_t cpuset_attach_nodemask_to;
> +static bool queue_task_work;
...
> @@ -3686,7 +3685,12 @@ static void cpuset_fork(struct task_struct *task)
>  	/* CLONE_INTO_CGROUP */
>  	mutex_lock(&cpuset_mutex);
>  	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
> +	/* Assume CPUs and memory nodes are updated */
> +	attach_cpus_updated = attach_mems_updated = true;
> +	cpuset_attach_old_cs = oldcs;
> +	oldcs->old_mems_allowed = oldcs->effective_mems;
>  	cpuset_attach_task(cs, task);
> +	attach_cpus_updated = attach_mems_updated = false;
>  
>  	dec_attach_in_progress_locked(cs);
>  	mutex_unlock(&cpuset_mutex);
Just a minor nit while running checkpatch --strict on this patch:

checkpatch reports:

CHECK: multiple assignments should be avoided

Perhaps the multiple assignments can be split to keep the patch
checkpatch-clean?

attach_cpus_updated = true;
attach_mems_updated = true;

and later:

attach_cpus_updated = false;
attach_mems_updated = false;

Just a style nit.

Best,
Guopeng

^ permalink raw reply

* Re: [PATCH-next v3 5/5] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach()
From: Guopeng Zhang @ 2026-05-29  2:26 UTC (permalink / raw)
  To: Waiman Long, Chen Ridong, Tejun Heo, Johannes Weiner,
	Michal Koutný, Ingo Molnar, Peter Zijlstra
  Cc: cgroups, linux-kernel, Aaron Tomlin
In-Reply-To: <20260527153800.1557449-6-longman@redhat.com>



在 2026/5/27 23:38, Waiman Long 写道:
> With cgroup v2, the cgroup_taskset structure passed into the cgroup
> can_attach() and attach() methods can contain task migration data with
> multiple destination or source cpusets when the cpuset controller is
> enabled or disabled respectively.
...
> -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
> +/*
> + * Called by cgroups to determine if a cpuset is usable; cpuset_mutex held.
> + *
> + * With cgroup v2, enabling of cpuset controller in a cgroup subtree can
> + * cause @tset to contain task migration data from one parent cpuset to multiple
> + * child cpusets. Not much is needed to be done here other than tracking the
> + * number of DL tasks in each cpuset as the CPUs and memory nodes of the child
> + * cpusets are exactly the same as the parent.
> + *
> + * Conversely, disabling of cpuset controller can cause @tset to contain task
> + * migration data from multiple child cpusets to one parent cpuset. Here, the
> + * CPUs and memory nodes of the child cpusets may be different from the parent,
> + * but must be a subset of its parent.
> + *
> + * Another possible many-to-one migration is the moving of the whole
> + * multithreaded process with threads in different cpusets to another cpuset.
> + *
> + * For all other use cases, @tset task migration data should be from one source
> + * cpuset to one destination cpuset.
> + */
>  static int cpuset_can_attach(struct cgroup_taskset *tset)
>  {
>  	struct cgroup_subsys_state *css;
> @@ -3079,6 +3172,16 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>  		goto out_unlock;
>  
>  	cgroup_taskset_for_each(task, css, tset) {
> +		struct cpuset *newcs = css_cs(css);
> +		struct cpuset *new_oldcs = task_cs(task);
> +
> +		if ((newcs != cs) || (new_oldcs != oldcs)) {
> +			cs = newcs;
> +			oldcs = new_oldcs;
> +			ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
> +			if (ret)
> +				goto out_unlock;
> +		}
Just a minor nit while running checkpatch --strict on this patch:

checkpatch reports unnecessary parentheses here:

if ((newcs != cs) || (new_oldcs != oldcs)) {

Perhaps this can be simplified to:

if (newcs != cs || new_oldcs != oldcs) {
>  		ret = task_can_attach(task);
>  		if (ret)
...
>  	/*
>  	 * In the default hierarchy, enabling cpuset in the child cgroups
> -	 * will trigger a number of cpuset_attach() calls with no change
> -	 * in effective cpus and mems. In that case, we can optimize out
> -	 * by skipping the task iteration and update.
> +	 * will trigger a cpuset_attach() call with no change in effective cpus
> +	 * and mems. In that case, we can optimize out by skipping the task
> +	 * iteration and update, but the destination cpuset list is iterated to
> +	 * set old_mems_sllowed.
>  	 */
I also noticed one small typo in the added comment:

s/old_mems_sllowed/old_mems_allowed/

Best,
Guopeng



^ permalink raw reply

* [linux-next:master] BUILD REGRESSION f7af91adc230aa99e23330ecf85bc9badd9780ad
From: kernel test robot @ 2026-05-29  6:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Memory Management List, cgroups, netdev, rust-for-linux,
	Mark Brown

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: f7af91adc230aa99e23330ecf85bc9badd9780ad  Add linux-next specific files for 20260528

Error/Warning (recently discovered and may have been fixed):

    https://lore.kernel.org/oe-kbuild-all/202605290432.Pyzd4kuW-lkp@intel.com
    https://lore.kernel.org/oe-kbuild-all/202605291041.seNEWvLQ-lkp@intel.com
    https://lore.kernel.org/oe-kbuild-all/202605291334.abNeADbG-lkp@intel.com

    drivers/net/arcnet/com20020-pci.c:225:52: warning: 'snprintf' output may be truncated before the last format character [-Wformat-truncation=]
    mm/memcontrol-v1.c:651:31: error: implicit declaration of function 'swp_cluster_offset' [-Wimplicit-function-declaration]
    samples/rust/rust_driver_auxiliary.o: warning: objtool: _RINvXs5_NtNtCshc5sK6KjdJJ_6kernel5alloc4kboxINtB6_3BoxINtNtNtCsbsbRiabPlh9_4core3mem12maybe_uninit11MaybeUninitINtNtBa_9auxiliary16RegistrationDataNtCseULRbgTYaTO_21rust_driver_auxiliary4DataEENtNtB8_9allocator7KmallocEINtCs57PXBekmiam_8pin_init12InPlaceWriteB1L_E14write_pin_initNtNtBa_5error5ErrorINtNtB3y_10___internal11InitClosureNCINvYIBH_B1L_B35_EINtNtBa_4init11InPlaceInitB1L_E8pin_initB4u_IB4O_NCINvMsc_B1O_INtB1O_12RegistrationB2l_E3newNtNtBX_7convert10InfallibleB2l_Es_0B1L_B4u_EE0B1L_B4u_EEB2n_: symbol name too long, can't create __pfx_ symbol

Unverified Error/Warning (likely false positive, kindly check if interested):

    block/partitions/ldm.c:1487:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/base/bus.c:801:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/dpll/dpll_core.c:238:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/hwtracing/intel_th/msu.c:1277:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/infiniband/hw/mthca/mthca_memfree.c:220:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/media/common/siano/smscoreapi.c:744:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/misc/eeprom/at25.c:466:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/pinctrl/core.c:2168:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/regulator/core.c:6262:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/staging/greybus/uart.c:917:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    drivers/usb/core/../misc/onboard_usb_dev_pdevs.c:124:1: internal compiler error: in final_scan_insn_1, at final.c:3073
    drivers/usb/dwc3/gadget.c:3474:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    fs/jfs/jfs_logmgr.c:1417:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    fs/notify/fanotify/fanotify_user.c:1746:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    fs/ntfs/super.c:2537:1: internal compiler error: in final_scan_insn_1, at final.cc:2813
    net/rxrpc/sendmsg.c:497:1: internal compiler error: in final_scan_insn_1, at final.cc:2813

Error/Warning ids grouped by kconfigs:

recent_errors
|-- alpha-randconfig-r072-20260529
|   `-- drivers-net-arcnet-com20020-pci.c:warning:snprintf-output-may-be-truncated-before-the-last-format-character
|-- csky-randconfig-001-20260529
|   |-- drivers-base-bus.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-misc-eeprom-at25.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-pinctrl-core.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-regulator-core.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-staging-greybus-uart.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-usb-dwc3-gadget.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- fs-jfs-jfs_logmgr.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   `-- fs-notify-fanotify-fanotify_user.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|-- csky-randconfig-002
|   `-- drivers-usb-core-..-misc-onboard_usb_dev_pdevs.c:internal-compiler-error:in-final_scan_insn_1-at-final.c
|-- csky-randconfig-r112-20260528
|   |-- block-partitions-ldm.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-dpll-dpll_core.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-media-common-siano-smscoreapi.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   `-- fs-ntfs-super.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|-- csky-randconfig-r122-20260528
|   |-- drivers-hwtracing-intel_th-msu.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   |-- drivers-infiniband-hw-mthca-mthca_memfree.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|   `-- net-rxrpc-sendmsg.c:internal-compiler-error:in-final_scan_insn_1-at-final.cc
|-- sh-allyesconfig
|   `-- mm-memcontrol-v1.c:error:implicit-declaration-of-function-swp_cluster_offset
`-- x86_64-randconfig-004-20260529
    `-- samples-rust-rust_driver_auxiliary.o:warning:objtool:_RINvXs5_NtNtCshc5sK6KjdJJ_6kernel5alloc4kboxINtB6_3BoxINtNtNtCsbsbRiabPlh9_4core3mem12maybe_uninit11MaybeUninitINtNtBa_9auxiliary16RegistrationDat

elapsed time: 753m

configs tested: 178
configs skipped: 2

tested configs:
alpha                             allnoconfig    gcc-15.2.0
alpha                            allyesconfig    gcc-15.2.0
alpha                               defconfig    gcc-15.2.0
arc                              allmodconfig    gcc-15.2.0
arc                               allnoconfig    gcc-15.2.0
arc                              allyesconfig    gcc-15.2.0
arc                                 defconfig    gcc-15.2.0
arc                   randconfig-001-20260529    gcc-14.3.0
arc                   randconfig-002-20260529    gcc-10.5.0
arm                               allnoconfig    clang-23
arm                              allyesconfig    gcc-15.2.0
arm                                 defconfig    clang-23
arm                                 defconfig    gcc-15.2.0
arm                   randconfig-001-20260529    gcc-14.3.0
arm                   randconfig-002-20260529    gcc-8.5.0
arm                   randconfig-003-20260529    gcc-13.4.0
arm                   randconfig-004-20260529    clang-23
arm64                            allmodconfig    clang-19
arm64                             allnoconfig    gcc-15.2.0
arm64                               defconfig    gcc-15.2.0
arm64                 randconfig-001-20260529    gcc-8.5.0
arm64                 randconfig-002-20260529    clang-23
arm64                 randconfig-003-20260529    clang-23
arm64                 randconfig-004-20260529    gcc-15.2.0
csky                             allmodconfig    gcc-15.2.0
csky                              allnoconfig    gcc-15.2.0
csky                                defconfig    gcc-15.2.0
csky                  randconfig-001-20260529    gcc-15.2.0
csky                  randconfig-002-20260529    gcc-15.2.0
hexagon                          allmodconfig    clang-17
hexagon                           allnoconfig    clang-23
hexagon                             defconfig    clang-23
hexagon                             defconfig    gcc-15.2.0
hexagon               randconfig-001-20260529    clang-23
hexagon               randconfig-002-20260529    clang-23
i386                             allmodconfig    gcc-14
i386                              allnoconfig    gcc-14
i386                             allyesconfig    gcc-14
i386        buildonly-randconfig-001-20260529    gcc-12
i386        buildonly-randconfig-002-20260529    clang-20
i386        buildonly-randconfig-003-20260529    gcc-12
i386        buildonly-randconfig-004-20260529    gcc-14
i386        buildonly-randconfig-005-20260529    gcc-14
i386        buildonly-randconfig-006-20260529    gcc-14
i386                                defconfig    clang-20
i386                                defconfig    gcc-15.2.0
i386                  randconfig-001-20260529    gcc-14
i386                  randconfig-002-20260529    gcc-12
i386                  randconfig-003-20260529    clang-20
i386                  randconfig-004-20260529    clang-20
i386                  randconfig-005-20260529    gcc-14
i386                  randconfig-006-20260529    clang-20
i386                  randconfig-007-20260529    gcc-12
i386                  randconfig-011-20260529    clang-20
i386                  randconfig-012-20260529    clang-20
i386                  randconfig-013-20260529    clang-20
i386                  randconfig-014-20260529    clang-20
i386                  randconfig-015-20260529    gcc-14
i386                  randconfig-016-20260529    clang-20
i386                  randconfig-017-20260529    gcc-13
loongarch                        allmodconfig    clang-19
loongarch                         allnoconfig    clang-23
loongarch                           defconfig    clang-19
loongarch             randconfig-001-20260529    gcc-15.2.0
loongarch             randconfig-002-20260529    gcc-15.2.0
m68k                             allmodconfig    gcc-15.2.0
m68k                              allnoconfig    gcc-15.2.0
m68k                             allyesconfig    gcc-15.2.0
m68k                                defconfig    gcc-15.2.0
microblaze                        allnoconfig    gcc-15.2.0
microblaze                       allyesconfig    gcc-15.2.0
microblaze                          defconfig    gcc-15.2.0
mips                             allmodconfig    gcc-15.2.0
mips                              allnoconfig    gcc-15.2.0
mips                             allyesconfig    gcc-15.2.0
nios2                            allmodconfig    gcc-11.5.0
nios2                             allnoconfig    gcc-11.5.0
nios2                               defconfig    gcc-11.5.0
nios2                 randconfig-001-20260529    gcc-8.5.0
nios2                 randconfig-002-20260529    gcc-8.5.0
openrisc                         allmodconfig    gcc-15.2.0
openrisc                          allnoconfig    gcc-15.2.0
openrisc                            defconfig    gcc-15.2.0
openrisc                  or1klitex_defconfig    gcc-15.2.0
parisc                           allmodconfig    gcc-15.2.0
parisc                            allnoconfig    gcc-15.2.0
parisc                           allyesconfig    gcc-15.2.0
parisc                              defconfig    gcc-15.2.0
parisc                randconfig-001-20260529    gcc-8.5.0
parisc                randconfig-002-20260529    gcc-8.5.0
parisc64                            defconfig    gcc-15.2.0
powerpc                          allmodconfig    gcc-15.2.0
powerpc                           allnoconfig    gcc-15.2.0
powerpc               randconfig-001-20260529    gcc-12.5.0
powerpc               randconfig-002-20260529    clang-19
powerpc64             randconfig-001-20260529    gcc-15.2.0
powerpc64             randconfig-002-20260529    clang-19
riscv                            allmodconfig    clang-23
riscv                             allnoconfig    gcc-15.2.0
riscv                            allyesconfig    clang-16
riscv                               defconfig    clang-23
riscv                               defconfig    gcc-15.2.0
riscv                 randconfig-001-20260529    gcc-8.5.0
riscv                 randconfig-002-20260529    gcc-9.5.0
s390                             allmodconfig    clang-18
s390                              allnoconfig    clang-23
s390                             allyesconfig    gcc-15.2.0
s390                                defconfig    clang-23
s390                                defconfig    gcc-15.2.0
s390                  randconfig-001-20260529    gcc-8.5.0
s390                  randconfig-002-20260529    clang-23
sh                               allmodconfig    gcc-15.2.0
sh                                allnoconfig    gcc-15.2.0
sh                               allyesconfig    gcc-15.2.0
sh                                  defconfig    gcc-15.2.0
sh                    randconfig-001-20260529    gcc-15.2.0
sh                    randconfig-002-20260529    gcc-15.2.0
sparc                             allnoconfig    gcc-15.2.0
sparc                               defconfig    gcc-15.2.0
sparc                 randconfig-001-20260529    gcc-8.5.0
sparc                 randconfig-002-20260529    gcc-15.2.0
sparc64                          allmodconfig    clang-23
sparc64                             defconfig    clang-20
sparc64               randconfig-001-20260529    gcc-11.5.0
sparc64               randconfig-002-20260529    gcc-8.5.0
um                               allmodconfig    clang-19
um                                allnoconfig    clang-23
um                               allyesconfig    gcc-14
um                                  defconfig    clang-23
um                             i386_defconfig    gcc-14
um                    randconfig-001-20260529    gcc-14
um                    randconfig-002-20260529    gcc-14
um                           x86_64_defconfig    clang-23
x86_64                           allmodconfig    clang-20
x86_64                            allnoconfig    clang-20
x86_64                           allyesconfig    clang-20
x86_64      buildonly-randconfig-001-20260529    gcc-14
x86_64      buildonly-randconfig-002-20260529    gcc-14
x86_64      buildonly-randconfig-003-20260529    gcc-14
x86_64      buildonly-randconfig-004-20260529    clang-20
x86_64      buildonly-randconfig-004-20260529    gcc-14
x86_64      buildonly-randconfig-005-20260529    clang-20
x86_64      buildonly-randconfig-005-20260529    gcc-14
x86_64      buildonly-randconfig-006-20260529    gcc-14
x86_64                              defconfig    gcc-14
x86_64                                  kexec    clang-20
x86_64                randconfig-001-20260529    gcc-14
x86_64                randconfig-002-20260529    clang-20
x86_64                randconfig-003-20260529    gcc-14
x86_64                randconfig-004-20260529    clang-20
x86_64                randconfig-005-20260529    clang-20
x86_64                randconfig-006-20260529    clang-20
x86_64                randconfig-011-20260529    gcc-14
x86_64                randconfig-012-20260529    gcc-14
x86_64                randconfig-013-20260529    clang-20
x86_64                randconfig-014-20260529    clang-20
x86_64                randconfig-015-20260529    gcc-14
x86_64                randconfig-016-20260529    clang-20
x86_64                randconfig-071-20260529    gcc-14
x86_64                randconfig-072-20260529    clang-20
x86_64                randconfig-073-20260529    clang-20
x86_64                randconfig-074-20260529    gcc-14
x86_64                randconfig-075-20260529    gcc-14
x86_64                randconfig-076-20260529    gcc-14
x86_64                               rhel-9.4    clang-20
x86_64                               rhel-9.4    gcc-14
x86_64                           rhel-9.4-bpf    gcc-14
x86_64                          rhel-9.4-func    clang-20
x86_64                          rhel-9.4-func    gcc-14
x86_64                    rhel-9.4-kselftests    clang-20
x86_64                    rhel-9.4-kselftests    gcc-14
x86_64                         rhel-9.4-kunit    gcc-14
x86_64                           rhel-9.4-ltp    gcc-14
x86_64                          rhel-9.4-rust    clang-20
xtensa                            allnoconfig    gcc-15.2.0
xtensa                           allyesconfig    gcc-15.2.0
xtensa                randconfig-001-20260529    gcc-11.5.0
xtensa                randconfig-002-20260529    gcc-8.5.0

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v3] cgroup/dmem: introduce a peak file
From: Maarten Lankhorst @ 2026-05-29  7:34 UTC (permalink / raw)
  To: Michal Koutný, Thadeu Lima de Souza Cascardo
  Cc: Tejun Heo, Johannes Weiner, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Muchun Song, Andrew Morton, Jonathan Corbet,
	Shuah Khan, Maxime Ripard, Natalie Vock, Tvrtko Ursulin, cgroups,
	linux-kernel, linux-mm, linux-doc, dri-devel, kernel-dev
In-Reply-To: <ahCISfTlN10gD8e6@localhost.localdomain>



Den 2026-05-22 kl. 18:48, skrev Michal Koutný:
> On Thu, May 14, 2026 at 02:36:08PM -0300, Thadeu Lima de Souza Cascardo <cascardo@igalia.com> wrote:
>> Just like we have memory.peak, introduce a dmem.peak, which uses the
>> page_counter support for that.
>>
>> For now, make it read-only.
>>
>> This allows for memory usage monitoring without polling dmem.current when
>> the information needed is the maximum device memory used. That can be used
>> for capacity planning, such that dmem.max can be properly setup for a given
>> workload. It can also be used for debugging to determine whether a given
>> workload would have caused eviction or system memory use.
>>
>> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@igalia.com>
>> ---
>> Changes in v3:
>> - EDITME: describe what is new in this series revision.
>> - EDITME: use bulletpoints and terse descriptions.
>> - Link to v2: https://patch.msgid.link/20260513-dmem_peak-v2-1-dac06999db9e@igalia.com
>>
>> Changes in v2:
>> - Make it read-only for now and adjust documentation accordingly.
>> - Link to v1: https://patch.msgid.link/20260506-dmem_peak-v1-0-8d803eb3449c@igalia.com
>> ---
>>  Documentation/admin-guide/cgroup-v2.rst |  6 ++++++
>>  kernel/cgroup/dmem.c                    | 15 +++++++++++++++
>>  2 files changed, 21 insertions(+)
>>
>> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
>> index 6efd0095ed99..d103623b2be4 100644
>> --- a/Documentation/admin-guide/cgroup-v2.rst
>> +++ b/Documentation/admin-guide/cgroup-v2.rst
>> @@ -2808,6 +2808,12 @@ DMEM Interface Files
>>  	The semantics are the same as for the memory cgroup controller, and are
>>  	calculated in the same way.
>>  
>> +  dmem.peak
>> +	A read-only nested-keyed file that exists on non-root cgroups.
> 
> s/nested-keyed/flat-keyed/
> 
> 
> With that
> 
> Reviewed-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Maarten Lankhorst <dev@lankhorst.se>

With your r-b, is it OK to push it to the dmemcg tree?

Kind regards,
~Maarten Lankhorst

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: kernel test robot @ 2026-05-29  8:47 UTC (permalink / raw)
  To: Yury Norov, Andrew Morton, David Hildenbrand, Zi Yan,
	Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, linux-kernel
  Cc: oe-kbuild-all, Linux Memory Management List, Yury Norov,
	Farhad Alemi, Waiman Long, Rasmus Villemoes, cgroups
In-Reply-To: <20260528190337.878027-1-ynorov@nvidia.com>

Hi Yury,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Yury-Norov/mm-don-t-allow-empty-relative-nodemask-in-mpol_relative_nodemask/20260529-030835
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260528190337.878027-1-ynorov%40nvidia.com
patch subject: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
config: x86_64-buildonly-randconfig-003-20260529 (https://download.01.org/0day-ci/archive/20260529/202605291631.6MATSv6v-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605291631.6MATSv6v-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605291631.6MATSv6v-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/mempolicy.c: In function 'mpol_relative_nodemask':
>> mm/mempolicy.c:377:24: error: 'return' with a value, in function returning void [-Wreturn-mismatch]
     377 |                 return -EINVAL;
         |                        ^
   mm/mempolicy.c:370:13: note: declared here
     370 | static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
         |             ^~~~~~~~~~~~~~~~~~~~~~

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for MFD_STMFX
   Depends on [n]: HAS_IOMEM [=y] && I2C [=y] && OF [=n]
   Selected by [y]:
   - PINCTRL_STMFX [=y] && PINCTRL [=y] && I2C [=y] && HAS_IOMEM [=y]


vim +/return +377 mm/mempolicy.c

   369	
   370	static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
   371					   const nodemask_t *rel)
   372	{
   373		unsigned int w = nodes_weight(*rel);
   374		nodemask_t tmp;
   375	
   376		if (w == 0)
 > 377			return -EINVAL;
   378	
   379		nodes_fold(tmp, *orig, w);
   380		nodes_onto(*ret, tmp, *rel);
   381	}
   382	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: kernel test robot @ 2026-05-29  8:58 UTC (permalink / raw)
  To: Yury Norov, Andrew Morton, David Hildenbrand, Zi Yan,
	Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, linux-kernel
  Cc: llvm, oe-kbuild-all, Linux Memory Management List, Yury Norov,
	Farhad Alemi, Waiman Long, Rasmus Villemoes, cgroups
In-Reply-To: <20260528190337.878027-1-ynorov@nvidia.com>

Hi Yury,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Yury-Norov/mm-don-t-allow-empty-relative-nodemask-in-mpol_relative_nodemask/20260529-030835
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260528190337.878027-1-ynorov%40nvidia.com
patch subject: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260529/202605291609.AR5UEvmT-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605291609.AR5UEvmT-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605291609.AR5UEvmT-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/mempolicy.c:377:3: warning: void function 'mpol_relative_nodemask' should not return a value [-Wreturn-mismatch]
     377 |                 return -EINVAL;
         |                 ^      ~~~~~~~
   1 warning generated.


vim +/mpol_relative_nodemask +377 mm/mempolicy.c

   369	
   370	static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
   371					   const nodemask_t *rel)
   372	{
   373		unsigned int w = nodes_weight(*rel);
   374		nodemask_t tmp;
   375	
   376		if (w == 0)
 > 377			return -EINVAL;
   378	
   379		nodes_fold(tmp, *orig, w);
   380		nodes_onto(*ret, tmp, *rel);
   381	}
   382	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [PATCH rdma-next v2 1/3] cgroup/rdma: extend charge/uncharge API with s64 amount parameter
From: Tao Cui @ 2026-05-29  9:07 UTC (permalink / raw)
  To: tj, hannes, mkoutny, leon, jgg; +Cc: linux-rdma, cgroups, Tao Cui
In-Reply-To: <20260529090733.2242822-1-cui.tao@linux.dev>

From: Tao Cui <cuitao@kylinos.cn>

Change struct rdmacg_resource fields (max, usage, peak) and all
charge/uncharge function signatures from int to s64 to prepare for
byte-sized resource tracking such as MR memory.

Replace match_int with a match_s64 helper that uses kstrtoll so the
user-space limit tokens accept 64-bit values.  All existing callers
pass amount=1 (count-based), so the change is transparent for
existing count-based resources.

The rpool->usage_sum counter continues to track the number of active
charge operations (not the sum of charged amounts); this is correct
because it governs rpool lifetime - a pool is releasable only when
all charges, regardless of amount, have been released.

Signed-off-by: Tao Cui <cuitao@kylinos.cn>
---
 drivers/infiniband/core/cgroup.c     | 10 ++--
 drivers/infiniband/core/core_priv.h  | 12 ++--
 drivers/infiniband/core/rdma_core.c  |  8 +--
 drivers/infiniband/core/uverbs_cmd.c |  4 +-
 include/linux/cgroup_rdma.h          |  7 ++-
 kernel/cgroup/rdma.c                 | 87 ++++++++++++++++++----------
 6 files changed, 83 insertions(+), 45 deletions(-)

diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
index 1f037fe01450..81e24de72392 100644
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -36,18 +36,20 @@ void ib_device_unregister_rdmacg(struct ib_device *device)
 
 int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
 			 struct ib_device *device,
-			 enum rdmacg_resource_type resource_index)
+			 enum rdmacg_resource_type resource_index,
+			 s64 amount)
 {
 	return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
-				 resource_index);
+				 resource_index, amount);
 }
 EXPORT_SYMBOL(ib_rdmacg_try_charge);
 
 void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 			struct ib_device *device,
-			enum rdmacg_resource_type resource_index)
+			enum rdmacg_resource_type resource_index,
+			s64 amount)
 {
 	rdmacg_uncharge(cg_obj->cg, &device->cg_device,
-			resource_index);
+			resource_index, amount);
 }
 EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a2c36666e6fc..345356d1e504 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -159,11 +159,13 @@ void ib_device_unregister_rdmacg(struct ib_device *device);
 
 int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
 			 struct ib_device *device,
-			 enum rdmacg_resource_type resource_index);
+			 enum rdmacg_resource_type resource_index,
+			 s64 amount);
 
 void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 			struct ib_device *device,
-			enum rdmacg_resource_type resource_index);
+			enum rdmacg_resource_type resource_index,
+			 s64 amount);
 #else
 static inline void ib_device_register_rdmacg(struct ib_device *device)
 {
@@ -175,14 +177,16 @@ static inline void ib_device_unregister_rdmacg(struct ib_device *device)
 
 static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
 				       struct ib_device *device,
-				       enum rdmacg_resource_type resource_index)
+				       enum rdmacg_resource_type resource_index,
+			       s64 amount)
 {
 	return 0;
 }
 
 static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 				      struct ib_device *device,
-				      enum rdmacg_resource_type resource_index)
+				      enum rdmacg_resource_type resource_index,
+			      s64 amount)
 {
 }
 #endif
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 5018ec837056..3268285b5478 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -437,7 +437,7 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
 		goto uobj_put;
 
 	ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
-				   RDMACG_RESOURCE_HCA_OBJECT);
+				   RDMACG_RESOURCE_HCA_OBJECT, 1);
 	if (ret)
 		goto remove;
 
@@ -526,7 +526,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
 static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
 {
 	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
-			   RDMACG_RESOURCE_HCA_OBJECT);
+			   RDMACG_RESOURCE_HCA_OBJECT, 1);
 
 	xa_erase(&uobj->ufile->idr, uobj->id);
 }
@@ -547,7 +547,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
 		return 0;
 
 	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
-			   RDMACG_RESOURCE_HCA_OBJECT);
+			   RDMACG_RESOURCE_HCA_OBJECT, 1);
 
 	return 0;
 }
@@ -878,7 +878,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 	}
 
 	ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
-			   RDMACG_RESOURCE_HCA_HANDLE);
+			   RDMACG_RESOURCE_HCA_HANDLE, 1);
 
 	rdma_restrack_del(&ucontext->res);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 91a62d2ade4d..9540ac180711 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -234,7 +234,7 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
 	}
 
 	ret = ib_rdmacg_try_charge(&ucontext->cg_obj, ucontext->device,
-				   RDMACG_RESOURCE_HCA_HANDLE);
+				   RDMACG_RESOURCE_HCA_HANDLE, 1);
 	if (ret)
 		goto err;
 
@@ -273,7 +273,7 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
 
 err_uncharge:
 	ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
-			   RDMACG_RESOURCE_HCA_HANDLE);
+			   RDMACG_RESOURCE_HCA_HANDLE, 1);
 err:
 	mutex_unlock(&file->ucontext_lock);
 	up_read(&file->hw_destroy_rwsem);
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
index 404e746552ca..7146cefa95a6 100644
--- a/include/linux/cgroup_rdma.h
+++ b/include/linux/cgroup_rdma.h
@@ -7,6 +7,7 @@
 #define _CGROUP_RDMA_H
 
 #include <linux/cgroup.h>
+#include <linux/types.h>
 
 enum rdmacg_resource_type {
 	RDMACG_RESOURCE_HCA_HANDLE,
@@ -46,9 +47,11 @@ void rdmacg_unregister_device(struct rdmacg_device *device);
 /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 		      struct rdmacg_device *device,
-		      enum rdmacg_resource_type index);
+		      enum rdmacg_resource_type index,
+		      s64 amount);
 void rdmacg_uncharge(struct rdma_cgroup *cg,
 		     struct rdmacg_device *device,
-		     enum rdmacg_resource_type index);
+		     enum rdmacg_resource_type index,
+		     s64 amount);
 #endif	/* CONFIG_CGROUP_RDMA */
 #endif	/* _CGROUP_RDMA_H */
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 5e82a03b3270..519f7f537223 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -59,9 +59,9 @@ static char const *rdmacg_resource_names[] = {
 
 /* resource tracker for each resource of rdma cgroup */
 struct rdmacg_resource {
-	int max;
-	int usage;
-	int peak;
+	s64 max;
+	s64 usage;
+	s64 peak;
 };
 
 /*
@@ -105,13 +105,13 @@ static inline struct rdma_cgroup *get_current_rdmacg(void)
 }
 
 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
-			       int index, int new_max)
+			       int index, s64 new_max)
 {
-	if (new_max == S32_MAX) {
-		if (rpool->resources[index].max != S32_MAX)
+	if (new_max == S64_MAX) {
+		if (rpool->resources[index].max != S64_MAX)
 			rpool->num_max_cnt++;
 	} else {
-		if (rpool->resources[index].max == S32_MAX)
+		if (rpool->resources[index].max == S64_MAX)
 			rpool->num_max_cnt--;
 	}
 	rpool->resources[index].max = new_max;
@@ -122,7 +122,7 @@ static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 	int i;
 
 	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
-		set_resource_limit(rpool, i, S32_MAX);
+		set_resource_limit(rpool, i, S64_MAX);
 }
 
 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
@@ -206,7 +206,8 @@ get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
 static void
 uncharge_cg_locked(struct rdma_cgroup *cg,
 		   struct rdmacg_device *device,
-		   enum rdmacg_resource_type index)
+		   enum rdmacg_resource_type index,
+		   s64 amount)
 {
 	struct rdmacg_resource_pool *rpool;
 
@@ -222,7 +223,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
 		return;
 	}
 
-	rpool->resources[index].usage--;
+	rpool->resources[index].usage -= amount;
 
 	/*
 	 * A negative count (or overflow) is invalid,
@@ -307,14 +308,15 @@ static void rdmacg_event_locked(struct rdma_cgroup *cg,
 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
 				     struct rdmacg_device *device,
 				     struct rdma_cgroup *stop_cg,
-				     enum rdmacg_resource_type index)
+				     enum rdmacg_resource_type index,
+				     s64 amount)
 {
 	struct rdma_cgroup *p;
 
 	mutex_lock(&rdmacg_mutex);
 
 	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
-		uncharge_cg_locked(p, device, index);
+		uncharge_cg_locked(p, device, index, amount);
 
 	mutex_unlock(&rdmacg_mutex);
 
@@ -329,12 +331,13 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
  */
 void rdmacg_uncharge(struct rdma_cgroup *cg,
 		     struct rdmacg_device *device,
-		     enum rdmacg_resource_type index)
+		     enum rdmacg_resource_type index,
+		     s64 amount)
 {
 	if (index >= RDMACG_RESOURCE_MAX)
 		return;
 
-	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+	rdmacg_uncharge_hierarchy(cg, device, NULL, index, amount);
 }
 EXPORT_SYMBOL(rdmacg_uncharge);
 
@@ -343,6 +346,7 @@ EXPORT_SYMBOL(rdmacg_uncharge);
  * @rdmacg: pointer to rdma cgroup which will own this resource
  * @device: pointer to rdmacg device
  * @index: index of the resource to charge in cgroup (resource pool)
+ * @amount: amount to charge
  *
  * This function follows charging resource in hierarchical way.
  * It will fail if the charge would cause the new value to exceed the
@@ -361,7 +365,8 @@ EXPORT_SYMBOL(rdmacg_uncharge);
  */
 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 		      struct rdmacg_device *device,
-		      enum rdmacg_resource_type index)
+		      enum rdmacg_resource_type index,
+		      s64 amount)
 {
 	struct rdma_cgroup *cg, *p;
 	struct rdmacg_resource_pool *rpool;
@@ -371,6 +376,9 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 	if (index >= RDMACG_RESOURCE_MAX)
 		return -EINVAL;
 
+	if (amount <= 0)
+		return -EINVAL;
+
 	/*
 	 * hold on to css, as cgroup can be removed but resource
 	 * accounting happens on css.
@@ -384,8 +392,9 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 			ret = PTR_ERR(rpool);
 			goto err;
 		} else {
-			new = (s64)rpool->resources[index].usage + 1;
-			if (new > rpool->resources[index].max) {
+			new = rpool->resources[index].usage + amount;
+			if (new < rpool->resources[index].usage ||
+			    new > rpool->resources[index].max) {
 				ret = -EAGAIN;
 				goto err;
 			} else {
@@ -409,7 +418,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
 	if (ret == -EAGAIN)
 		rdmacg_event_locked(cg, p, device, index);
 	mutex_unlock(&rdmacg_mutex);
-	rdmacg_uncharge_hierarchy(cg, device, p, index);
+	rdmacg_uncharge_hierarchy(cg, device, p, index, amount);
 	return ret;
 }
 EXPORT_SYMBOL(rdmacg_try_charge);
@@ -477,6 +486,25 @@ static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
 	return NULL;
 }
 
+static int match_s64(substring_t *s, s64 *result)
+{
+	char *buf;
+	int ret;
+	s64 val;
+
+	buf = kmemdup_nul(s->from, s->to - s->from, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	ret = kstrtoll(buf, 0, &val);
+	kfree(buf);
+	if (ret)
+		return ret;
+	if (val < 0)
+		return -EINVAL;
+	*result = val;
+	return 0;
+}
+
 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 				       char *buf, size_t nbytes, loff_t off)
 {
@@ -486,7 +514,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 	struct rdmacg_device *device;
 	char *options = strstrip(buf);
 	char *p;
-	int *new_limits;
+	s64 *new_limits;
 	unsigned long enables = 0;
 	int i = 0, ret = 0;
 
@@ -497,7 +525,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 		goto err;
 	}
 
-	new_limits = kzalloc_objs(int, RDMACG_RESOURCE_MAX);
+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(s64), GFP_KERNEL);
 	if (!new_limits) {
 		ret = -ENOMEM;
 		goto err;
@@ -506,7 +534,8 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 	/* parse resource limit tokens */
 	while ((p = strsep(&options, " \t\n"))) {
 		substring_t args[MAX_OPT_ARGS];
-		int tok, intval;
+		int tok;
+		s64 intval;
 
 		if (!*p)
 			continue;
@@ -514,7 +543,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 		tok = match_token(p, rdmacg_limit_tokens, args);
 		switch (tok) {
 		case RDMACG_HCA_HANDLE_VAL:
-			if (match_int(&args[0], &intval) || intval < 0) {
+			if (match_s64(&args[0], &intval)) {
 				ret = -EINVAL;
 				goto parse_err;
 			}
@@ -522,11 +551,11 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 			enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
 			break;
 		case RDMACG_HCA_HANDLE_MAX:
-			new_limits[RDMACG_RESOURCE_HCA_HANDLE] = S32_MAX;
+			new_limits[RDMACG_RESOURCE_HCA_HANDLE] = S64_MAX;
 			enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE);
 			break;
 		case RDMACG_HCA_OBJECT_VAL:
-			if (match_int(&args[0], &intval) || intval < 0) {
+			if (match_s64(&args[0], &intval)) {
 				ret = -EINVAL;
 				goto parse_err;
 			}
@@ -534,7 +563,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 			enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
 			break;
 		case RDMACG_HCA_OBJECT_MAX:
-			new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S32_MAX;
+			new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S64_MAX;
 			enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
 			break;
 		default:
@@ -588,7 +617,7 @@ static void print_rpool_values(struct seq_file *sf,
 {
 	enum rdmacg_file_type sf_type;
 	int i;
-	u32 value;
+	s64 value;
 
 	sf_type = seq_cft(sf)->private;
 
@@ -599,7 +628,7 @@ static void print_rpool_values(struct seq_file *sf,
 			if (rpool)
 				value = rpool->resources[i].max;
 			else
-				value = S32_MAX;
+				value = S64_MAX;
 		} else if (sf_type == RDMACG_RESOURCE_TYPE_PEAK) {
 			value = rpool ? rpool->resources[i].peak : 0;
 		} else {
@@ -609,10 +638,10 @@ static void print_rpool_values(struct seq_file *sf,
 				value = 0;
 		}
 
-		if (value == S32_MAX)
+		if (value == S64_MAX)
 			seq_puts(sf, RDMACG_MAX_STR);
 		else
-			seq_printf(sf, "%d", value);
+			seq_printf(sf, "%lld", value);
 		seq_putc(sf, ' ');
 	}
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH rdma-next v2 2/3] cgroup/rdma: add MR memory size resource tracking
From: Tao Cui @ 2026-05-29  9:07 UTC (permalink / raw)
  To: tj, hannes, mkoutny, leon, jgg; +Cc: linux-rdma, cgroups, Tao Cui
In-Reply-To: <20260529090733.2242822-1-cui.tao@linux.dev>

From: Tao Cui <cuitao@kylinos.cn>

Add RDMACG_RESOURCE_MR_MEM so that the cumulative memory size of
registered Memory Regions can be tracked and limited independently
from the aggregate hca_object counter.

Unlike count-based resources (hca_handle, hca_object) which are
charged in the generic IDR allocation path, MR_MEM is byte-based
and must be charged after the MR length is known.  Charge in the
uverbs MR registration handlers (ioctl and legacy), and uncharge
in the generic destroy paths (alloc_abort_idr_uobject,
destroy_hw_idr_uobject).

Store the charged byte count in uobj->rdmacg_mr_mem_bytes so that
the destroy path knows how much to uncharge.

Semantic notes
~~~~~~~~~~~~~~

mr_mem is not page-level ownership tracking - it is object-based
accounting tied to the MR lifetime:

  - charged at MR registration time
  - uncharged at MR destruction time
  - the charge lives with the MR's creating cgroup for the entire
    lifetime of the MR object

This model intentionally defines accounting semantics around MR
object lifetime rather than page ownership:

1. fork(): fork() does not duplicate MR objects.  Even though the
   child inherits the uverbs fd and can access the parent's ucontext,
   the MR remains a single kernel object.  The charge is tied to the
   MR object, not to the number of processes that can reach it, so
   no splitting or re-accounting is needed.

2. Cgroup migration: mr_mem follows the same semantics as the existing
   hca_object - charge at creation time against the invoking task's
   cgroup, uncharge at destruction time.  The RDMA cgroup does not
   implement can_attach/attach callbacks today, so charges do not
   migrate with the task.  This is a known limitation that applies
   equally to hca_handle and hca_object.  mr_mem does not introduce
   any new complication here.

3. Overlap with memory cgroup: mr_mem does not count process memory
   usage - it represents a per-device DMA registration budget: how
   much memory can this cgroup register through a given HCA.  This is
   a different dimension from what memory cgroup tracks.  An
   administrator might set mr_mem limits differently per device, which
   memory cgroup cannot express.

   In particular, mr_mem tracks the registered memory range associated
   with the MR rather than exact dynamically pinned pages (e.g. for
   ODP MRs).  This is a stable, policy-oriented approximation of
   registration footprint - not an attempt at precise physical page
   accounting.

Guard against u64-to-s64 overflow by rejecting MR lengths that
exceed S64_MAX at each registration site.

Handle MR reregistration (IB_USER_VERBS_CMD_REREG_MR with
IB_MR_REREG_TRANS) by computing the delta between old and new
lengths and charging or uncharging the difference.  When the driver
creates a new HW object (new_mr != NULL), the full new length is
charged to the new uobj and the old uobj's mr_mem is released
through the existing rdma_assign_uobject -> destroy_hw_idr_uobject
-> rdmacg_uncharge_uobj path.

Enable MR memory limits:

  echo "mlx5_0 mr_mem=1073741824" > rdma.max

Signed-off-by: Tao Cui <cuitao@kylinos.cn>
---
 drivers/infiniband/core/rdma_core.c           | 14 ++++-
 drivers/infiniband/core/uverbs_cmd.c          | 57 +++++++++++++++++++
 drivers/infiniband/core/uverbs_std_types_mr.c | 37 ++++++++++++
 include/linux/cgroup_rdma.h                   |  1 +
 include/rdma/ib_verbs.h                       |  1 +
 kernel/cgroup/rdma.c                          | 21 ++++++-
 6 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 3268285b5478..a540cef6bb67 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -523,10 +523,19 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
 	return ret;
 }
 
-static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
+static void rdmacg_uncharge_uobj(struct ib_uobject *uobj)
 {
 	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
 			   RDMACG_RESOURCE_HCA_OBJECT, 1);
+	if (uobj->rdmacg_mr_mem_bytes)
+		ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+				   RDMACG_RESOURCE_MR_MEM,
+				   uobj->rdmacg_mr_mem_bytes);
+}
+
+static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
+{
+	rdmacg_uncharge_uobj(uobj);
 
 	xa_erase(&uobj->ufile->idr, uobj->id);
 }
@@ -546,8 +555,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
 	if (why == RDMA_REMOVE_ABORT)
 		return 0;
 
-	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
-			   RDMACG_RESOURCE_HCA_OBJECT, 1);
+	rdmacg_uncharge_uobj(uobj);
 
 	return 0;
 }
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 9540ac180711..901de117c808 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -752,6 +752,17 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 
 	uobj->object = mr;
 	uobj_put_obj_read(pd);
+
+	if (cmd.length > S64_MAX)
+		goto err_free;
+	if (cmd.length) {
+		ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+					   RDMACG_RESOURCE_MR_MEM, cmd.length);
+		if (ret)
+			goto err_dereg;
+		uobj->rdmacg_mr_mem_bytes = cmd.length;
+	}
+
 	uobj_finalize_uobj_create(uobj, attrs);
 
 	resp.lkey = mr->lkey;
@@ -759,6 +770,8 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 	resp.mr_handle = uobj->id;
 	return uverbs_response(attrs, &resp, sizeof(resp));
 
+err_dereg:
+	ib_dereg_mr_user(mr, &attrs->driver_udata);
 err_put:
 	uobj_put_obj_read(pd);
 err_free:
@@ -854,6 +867,20 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
 		rdma_restrack_set_name(&new_mr->res, NULL);
 		rdma_restrack_add(&new_mr->res);
 
+		if ((cmd.flags & IB_MR_REREG_TRANS) && cmd.length) {
+			if (cmd.length > S64_MAX) {
+				ret = -EINVAL;
+				goto err_rereg_new_mr;
+			}
+			ret = ib_rdmacg_try_charge(&new_uobj->cg_obj,
+						   new_uobj->context->device,
+						   RDMACG_RESOURCE_MR_MEM,
+						   cmd.length);
+			if (ret)
+				goto err_rereg_new_mr;
+			new_uobj->rdmacg_mr_mem_bytes = cmd.length;
+		}
+
 		/*
 		 * The new uobj for the new HW object is put into the same spot
 		 * in the IDR and the old uobj & HW object is deleted.
@@ -871,6 +898,31 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
 			atomic_inc(&new_pd->usecnt);
 		}
 		if (cmd.flags & IB_MR_REREG_TRANS) {
+			s64 delta;
+
+			if (cmd.length > S64_MAX) {
+				ret = -EINVAL;
+				goto put_new_uobj;
+			}
+			delta = (s64)cmd.length -
+				(s64)uobj->rdmacg_mr_mem_bytes;
+
+			if (delta > 0) {
+				ret = ib_rdmacg_try_charge(
+					&uobj->cg_obj,
+					uobj->context->device,
+					RDMACG_RESOURCE_MR_MEM,
+					delta);
+				if (ret)
+					goto put_new_uobj;
+			} else if (delta < 0) {
+				ib_rdmacg_uncharge(
+					&uobj->cg_obj,
+					uobj->context->device,
+					RDMACG_RESOURCE_MR_MEM,
+					-delta);
+			}
+			uobj->rdmacg_mr_mem_bytes = cmd.length;
 			mr->iova = cmd.hca_va;
 			mr->length = cmd.length;
 		}
@@ -887,6 +939,11 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
 put_new_uobj:
 	if (new_uobj)
 		uobj_alloc_abort(new_uobj, attrs);
+err_rereg_new_mr:
+	if (new_uobj) {
+		rdma_alloc_abort_uobject(new_uobj, attrs, true);
+		new_uobj = NULL;
+	}
 put_uobj_pd:
 	if (cmd.flags & IB_MR_REREG_PD)
 		uobj_put_obj_read(new_pd);
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 570b9656801d..3989ff2d282b 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -32,6 +32,7 @@
  */
 
 #include "rdma_core.h"
+#include "core_priv.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 #include "restrack.h"
@@ -140,6 +141,18 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
 	rdma_restrack_set_name(&mr->res, NULL);
 	rdma_restrack_add(&mr->res);
 	uobj->object = mr;
+	if (attr.length > S64_MAX)
+		return -EINVAL;
+
+	if (attr.length) {
+		ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+					   RDMACG_RESOURCE_MR_MEM, attr.length);
+		if (ret) {
+			ib_dereg_mr_user(mr, &attrs->driver_udata);
+			return ret;
+		}
+		uobj->rdmacg_mr_mem_bytes = attr.length;
+	}
 
 	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE);
 
@@ -254,6 +267,18 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
 	rdma_restrack_add(&mr->res);
 	uobj->object = mr;
 
+	if (length > S64_MAX)
+		return -EINVAL;
+	if (length) {
+		ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+					   RDMACG_RESOURCE_MR_MEM, length);
+		if (ret) {
+			ib_dereg_mr_user(mr, &attrs->driver_udata);
+			return ret;
+		}
+		uobj->rdmacg_mr_mem_bytes = length;
+	}
+
 	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
 
 	ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
@@ -383,6 +408,18 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
 	rdma_restrack_add(&mr->res);
 	uobj->object = mr;
 
+	if (length > S64_MAX)
+		return -EINVAL;
+	if (length) {
+		ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+					   RDMACG_RESOURCE_MR_MEM, length);
+		if (ret) {
+			ib_dereg_mr_user(mr, &attrs->driver_udata);
+			return ret;
+		}
+		uobj->rdmacg_mr_mem_bytes = length;
+	}
+
 	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE);
 
 	ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY,
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
index 7146cefa95a6..2c8fb1ebb1a9 100644
--- a/include/linux/cgroup_rdma.h
+++ b/include/linux/cgroup_rdma.h
@@ -12,6 +12,7 @@
 enum rdmacg_resource_type {
 	RDMACG_RESOURCE_HCA_HANDLE,
 	RDMACG_RESOURCE_HCA_OBJECT,
+	RDMACG_RESOURCE_MR_MEM,
 	RDMACG_RESOURCE_MAX,
 };
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9dd76f489a0b..c7dcd5d085fb 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1569,6 +1569,7 @@ struct ib_uobject {
 	void		       *object;		/* containing object */
 	struct list_head	list;		/* link to context's list */
 	struct ib_rdmacg_object	cg_obj;		/* rdmacg object */
+	s64			rdmacg_mr_mem_bytes; /* charged MR memory size */
 	int			id;		/* index into kernel idr */
 	struct kref		ref;
 	atomic_t		usecnt;		/* protects exclusive access */
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 519f7f537223..ebfc5721c098 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -23,14 +23,18 @@ enum rdmacg_limit_tokens {
 	RDMACG_HCA_HANDLE_MAX,
 	RDMACG_HCA_OBJECT_VAL,
 	RDMACG_HCA_OBJECT_MAX,
+	RDMACG_MR_MEM_VAL,
+	RDMACG_MR_MEM_MAX,
 	NR_RDMACG_LIMIT_TOKENS,
 };
 
 static const match_table_t rdmacg_limit_tokens = {
-	{ RDMACG_HCA_HANDLE_VAL,	"hca_handle=%d"	},
+	{ RDMACG_HCA_HANDLE_VAL,	"hca_handle=%d"		},
 	{ RDMACG_HCA_HANDLE_MAX,	"hca_handle=max"	},
-	{ RDMACG_HCA_OBJECT_VAL,	"hca_object=%d"	},
+	{ RDMACG_HCA_OBJECT_VAL,	"hca_object=%d"		},
 	{ RDMACG_HCA_OBJECT_MAX,	"hca_object=max"	},
+	{ RDMACG_MR_MEM_VAL,		"mr_mem=%d"		},
+	{ RDMACG_MR_MEM_MAX,		"mr_mem=max"		},
 	{ NR_RDMACG_LIMIT_TOKENS,	NULL			},
 };
 
@@ -55,6 +59,7 @@ enum rdmacg_file_type {
 static char const *rdmacg_resource_names[] = {
 	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
 	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
+	[RDMACG_RESOURCE_MR_MEM]	= "mr_mem",
 };
 
 /* resource tracker for each resource of rdma cgroup */
@@ -566,6 +571,18 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
 			new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S64_MAX;
 			enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT);
 			break;
+		case RDMACG_MR_MEM_VAL:
+			if (match_s64(&args[0], &intval)) {
+				ret = -EINVAL;
+				goto parse_err;
+			}
+			new_limits[RDMACG_RESOURCE_MR_MEM] = intval;
+			enables |= BIT(RDMACG_RESOURCE_MR_MEM);
+			break;
+		case RDMACG_MR_MEM_MAX:
+			new_limits[RDMACG_RESOURCE_MR_MEM] = S64_MAX;
+			enables |= BIT(RDMACG_RESOURCE_MR_MEM);
+			break;
 		default:
 			ret = -EINVAL;
 			goto parse_err;
-- 
2.43.0


^ permalink raw reply related

* [PATCH rdma-next v2 3/3] cgroup/rdma: update cgroup resource list for MR_MEM
From: Tao Cui @ 2026-05-29  9:07 UTC (permalink / raw)
  To: tj, hannes, mkoutny, leon, jgg; +Cc: linux-rdma, cgroups, Tao Cui
In-Reply-To: <20260529090733.2242822-1-cui.tao@linux.dev>

From: Tao Cui <cuitao@kylinos.cn>

The RDMA cgroup now supports MR memory size tracking via the new
mr_mem resource.  Update the cgroup-v2 documentation to describe
the new resource and revise the usage examples accordingly.

The mr_mem resource tracks the cumulative size of memory registered
through Memory Regions per device per cgroup, providing a DMA
registration budget that is orthogonal to the existing hca_object
counter.

Signed-off-by: Tao Cui <cuitao@kylinos.cn>
---
 Documentation/admin-guide/cgroup-v2.rst | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 993446ab66d0..08d80e6f79ec 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2766,15 +2766,16 @@ RDMA Interface Files
 
 	The following nested keys are defined.
 
-	  ==========	=============================
+	  ==========	================================================
 	  hca_handle	Maximum number of HCA Handles
 	  hca_object 	Maximum number of HCA Objects
-	  ==========	=============================
+	  mr_mem	Maximum cumulative MR memory size in bytes
+	  ==========	================================================
 
 	An example for mlx4 and ocrdma device follows::
 
-	  mlx4_0 hca_handle=2 hca_object=2000
-	  ocrdma1 hca_handle=3 hca_object=max
+	  mlx4_0 hca_handle=2 hca_object=2000 mr_mem=1073741824
+	  ocrdma1 hca_handle=3 hca_object=max mr_mem=max
 
   rdma.current
 	A read-only file that describes current resource usage.
@@ -2782,8 +2783,8 @@ RDMA Interface Files
 
 	An example for mlx4 and ocrdma device follows::
 
-	  mlx4_0 hca_handle=1 hca_object=20
-	  ocrdma1 hca_handle=1 hca_object=23
+	  mlx4_0 hca_handle=1 hca_object=20 mr_mem=1048576
+	  ocrdma1 hca_handle=1 hca_object=23 mr_mem=0
 
   rdma.peak
 	A read-only nested-keyed file that exists for all the cgroups
@@ -2792,8 +2793,8 @@ RDMA Interface Files
 
 	An example for mlx4 and ocrdma device follows::
 
-	  mlx4_0 hca_handle=1 hca_object=20
-	  ocrdma1 hca_handle=0 hca_object=23
+	  mlx4_0 hca_handle=1 hca_object=20 mr_mem=1048576
+	  ocrdma1 hca_handle=0 hca_object=23 mr_mem=0
 
   rdma.events
 	A read-only nested-keyed file which exists on non-root
@@ -2815,7 +2816,7 @@ RDMA Interface Files
 
 	An example for mlx4 device follows::
 
-	  mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=3 hca_object.max=0 hca_object.alloc_fail=0
+	  mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=3 hca_object.max=0 hca_object.alloc_fail=0 mr_mem.max=0 mr_mem.alloc_fail=0
 
   rdma.events.local
 	Similar to rdma.events but the fields in the file are local
@@ -2836,7 +2837,7 @@ RDMA Interface Files
 
 	An example for mlx4 device follows::
 
-	  mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=0 hca_object.max=0 hca_object.alloc_fail=0
+	  mlx4_0 hca_handle.max=5 hca_handle.alloc_fail=0 hca_object.max=0 hca_object.alloc_fail=0 mr_mem.max=0 mr_mem.alloc_fail=0
 
 DMEM
 ----
-- 
2.43.0


^ permalink raw reply related

* [PATCH rdma-next v2 0/3] cgroup/rdma: add MR memory size resource tracking
From: Tao Cui @ 2026-05-29  9:07 UTC (permalink / raw)
  To: tj, hannes, mkoutny, leon, jgg; +Cc: linux-rdma, cgroups, Tao Cui

From: Tao Cui <cuitao@kylinos.cn>

Currently the RDMA cgroup only tracks two aggregate counters:
hca_handle and hca_object.  The real scarce resource in multi-tenant
deployments is pinned memory: how much physical memory gets registered
through MRs.  The existing hca_object counter is too coarse to capture
this.

This series adds a single new resource type:

  - mr_mem  - Cumulative MR memory size in bytes

The per-object-type counters (qp, mr) from RFC v1 have been removed
per review feedback [1]: modern NICs pool objects from the same memory
pool so the distinction between QP count and MR count is not
meaningful for resource limiting.  hca_object remains sufficient for
coarse object accounting.

After this series, an administrator can set limits like:

    echo "mlx5_0 mr_mem=1073741824" > rdma.max

Design
~~~~~~

mr_mem is not page-level ownership tracking; it is object-based
accounting tied to the MR lifetime:

  - charged at MR registration time
  - uncharged at MR destruction time
  - the charge is pinned to the cgroup that created the MR for the
    entire lifetime of the MR object

This model intentionally defines accounting semantics around MR
object lifetime rather than page ownership:

1. fork(): fork() does not duplicate MR objects.  Even though the
   child inherits the uverbs fd and can access the parent's ucontext,
   the MR remains a single kernel object.  The charge is tied to the
   MR object, not to the number of processes that can reach it, so
   no splitting or re-accounting is needed.

2. Cgroup migration: mr_mem follows the same semantics as the existing
   hca_object; charge at creation time against the invoking task's
   cgroup, uncharge at destruction time.  The RDMA cgroup does not
   implement can_attach/attach callbacks today, so charges do not
   migrate with the task.  This is a known limitation that applies
   equally to hca_handle and hca_object.  mr_mem does not introduce
   any new complication here.

3. Overlap with memory cgroup: mr_mem does not count process memory
   usage; it represents a per-device DMA registration budget: the
   amount of memory this cgroup may register through a given HCA.
   This is a different dimension from what memory cgroup tracks.  An
   administrator might set mr_mem limits differently per device, which
   memory cgroup cannot express.

   In particular, mr_mem tracks the registered memory range associated
   with the MR rather than exact dynamically pinned pages (e.g. for
   ODP MRs).  This is a stable, policy-oriented approximation of
   registration footprint, not an attempt at precise physical page
   accounting.

Tao Cui (3):
  cgroup/rdma: extend charge/uncharge API with s64 amount parameter
  cgroup/rdma: add MR memory size resource tracking
  cgroup/rdma: update cgroup resource list for MR_MEM

 Documentation/admin-guide/cgroup-v2.rst       |  21 ++--
 drivers/infiniband/core/cgroup.c              |  10 +-
 drivers/infiniband/core/core_priv.h           |  12 +-
 drivers/infiniband/core/rdma_core.c           |  20 +++-
 drivers/infiniband/core/uverbs_cmd.c          |  61 +++++++++-
 drivers/infiniband/core/uverbs_std_types_mr.c |  37 ++++++
 include/linux/cgroup_rdma.h                   |   8 +-
 include/rdma/ib_verbs.h                       |   1 +
 kernel/cgroup/rdma.c                          | 108 +++++++++++++-----
 9 files changed, 219 insertions(+), 59 deletions(-)

---
Changes from RFC v1:

  - Removed RDMACG_RESOURCE_QP and RDMACG_RESOURCE_MR per-type
    counters following review feedback from Jason Gunthorpe [1].
  - Retained only RDMACG_RESOURCE_MR_MEM as the sole new resource.
  - Added detailed semantic notes to the commit messages addressing
    fork(), cgroup migration, and overlap with memory cgroup [2].
  - Renamed patches to reflect the narrower scope.

[1] https://lore.kernel.org/all/20260525134314.GI7702@ziepe.ca/
[2] https://lore.kernel.org/all/20260528075537.2170697-1-cuitao@kylinos.cn/
-- 
2.43.0


^ permalink raw reply

* Re: [PATCH v5 5/9] mm: list_lru: deduplicate lock_list_lru()
From: Wei Yang @ 2026-05-29  9:56 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes, Shakeel Butt,
	Michal Hocko, Dave Chinner, Roman Gushchin, Muchun Song, Qi Zheng,
	Yosry Ahmed, Zi Yan, Liam R . Howlett, Usama Arif,
	Kiryl Shutsemau, Vlastimil Babka, Kairui Song, Mikhail Zaslonko,
	Vasily Gorbik, Baolin Wang, Barry Song, Dev Jain, Lance Yang,
	Nico Pache, Ryan Roberts, cgroups, linux-mm, linux-kernel
In-Reply-To: <20260527204757.2544958-6-hannes@cmpxchg.org>

On Wed, May 27, 2026 at 04:45:12PM -0400, Johannes Weiner wrote:
>The MEMCG and !MEMCG paths have the same pattern. Share the code.
>
>Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
>Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
>Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>Reviewed-by: Liam R. Howlett (Oracle) <liam@infradead.org>
>---
> mm/list_lru.c | 21 +++++++++------------
> 1 file changed, 9 insertions(+), 12 deletions(-)
>
>diff --git a/mm/list_lru.c b/mm/list_lru.c
>index 7d0523e44010..fdb3fe2ea64f 100644
>--- a/mm/list_lru.c
>+++ b/mm/list_lru.c
>@@ -15,6 +15,14 @@
> #include "slab.h"
> #include "internal.h"

Hi, Johannes

One very tiny nit below.

> 
>+static inline void lock_list_lru(struct list_lru_one *l, bool irq)

Here we use @irq.

>+{
>+	if (irq)
>+		spin_lock_irq(&l->lock);
>+	else
>+		spin_lock(&l->lock);
>+}
>+
> static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)

Here we use @irq_off.

Do you think it would be nicer to unify the parameter name?

Also the name at the call sites and in the argument annotation. Would that be
cleaner for readers?

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* [RFC PATCH v2 0/9] mm: support zswap-backed large folio swapin
From: fujunjie @ 2026-05-29 12:17 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups

Hi,

This RFC explores large-folio swapin for ranges that are still fully backed
by zswap.

Large swapin is currently disabled once zswap is in the picture. Anonymous
faults stop considering large orders after zswap has ever been enabled,
shmem does the same, and zswap_load() refuses large swapcache folios. That
keeps mixed zswap/disk cases safe, but it also loses the dense case where
every slot in an aligned 64K range is still resident in zswap.

The series keeps the policy in common swapin code:

  - zswap reports backend facts and provides the large-folio load helper.
  - swapin_sync() filters candidate orders by backend range.
  - all-disk and zeromap ranges keep the existing Kairui large-swapin path.
  - mixed zswap/disk ranges stay order-0.
  - all-zswap ranges may use a 64K folio after locality admission.
  - anon provides locality evidence from VMA hints and PTE young density.
  - shmem starts with explicit VMA-hint evidence only.
  - swap readahead uses its existing VMA/cluster window as locality
    evidence; it does not also run the anon PTE-young rule.

The backend range probe is only a snapshot. If the backend changes after a
fresh large swapcache folio is allocated, the common path drops that folio
and falls back to order-0. zswap_load() can also return -EAGAIN for the
same retry path. If a late fault retry keeps the large folio in swapcache
instead of deleting it, the cgroup v1 memsw swap owner is committed before
returning.

This is mTHP/large-folio swapin. The mappings installed by do_swap_page()
are still PTE mappings, not PMD mappings. The expected win is fewer faults,
batched PTE/rmap work, and preserving the large folio across zswapin
instead of rebuilding the working set as order-0 pages.

Prior art: Usama Arif posted a related RFC on 2024-10-18:

  mm: zswap: add support for zswapin of large folios
  https://lore.kernel.org/linux-mm/20241018105026.2521366-1-usamaarif642@gmail.com/

This RFC keeps the same broad goal, but moves admission into common swapin
code. zswap does not decide the policy. Mixed zswap/disk ranges are
rejected before large IO, and the first cap is 64K.

This is a rewrite of the RFC posted on 2026-05-08:

  [RFC PATCH 0/5] mm: support zswap-backed anonymous large folio swapin
  https://lore.kernel.org/linux-mm/tencent_8B437BE4F586C162950BF71954316C1EDB05@qq.com/

The v1 series was anonymous-only and kept too much of the policy near the
anon fault and zswap paths. This version is rebuilt on top of Kairui Song's
common swapin infrastructure. It keeps admission in common swapin code,
rejects mixed zswap/disk large ranges, and adds separate locality producers
for anon, shmem and swap readahead.

Performance and behavior
========================

The A/B tables are 10-run measurements. Elapsed values are seconds,
shown as mean +/- sample standard deviation. "phase" or "refault" is the
measured refault subphase. "zswpin" counts zswap loads. "pswpin" counts
swap-ins from the real swap device; pswpin=0 means the refaults were served
by zswap even when a disk swap device was configured. "RFC 64K" is the mean
number of successful 64K swapins.

The numbers below show where the large path is used and where it is
rejected.

zram-backed zswap microbench, 64K mTHP, 8G guest:

+-----------------+----------------+----------------+--------+--------+--------+----------+
| workload        | base elapsed   | RFC elapsed    | delta  | phase  | zswpin | RFC 64K  |
+-----------------+----------------+----------------+--------+--------+--------+----------+
| usama_1g        | 11.260+/-0.235 | 10.301+/-0.140 | -8.5%  | -22.2% | 1.000x | 16381.1  |
| nohint_seq64    |  4.398+/-0.085 |  4.025+/-0.022 | -8.5%  | -21.1% | 1.000x |  6221.1  |
| seqhint_seq64   |  4.283+/-0.060 |  3.948+/-0.062 | -7.8%  | -20.6% | 1.000x |  6223.5  |
| stride64_sparse |  3.095+/-0.051 |  3.086+/-0.025 | -0.3%  |  +5.8% | 1.002x |     1.0  |
| random64_sparse |  3.095+/-0.046 |  3.076+/-0.016 | -0.6%  |  +0.7% | 1.001x |     0.0  |
| random64_full   |  4.423+/-0.067 |  4.405+/-0.018 | -0.4%  |  +0.1% | 1.000x |     0.0  |
+-----------------+----------------+----------------+--------+--------+--------+----------+

The usama_1g row follows the shape of the 2024 RFC benchmark: allocate 1G,
fill it with compressible per-page data, reclaim it through memory.reclaim,
then time the full integrity-check refault. The seq64 rows use a 512M
target and 768M of pressure. "sparse" touches one 4K page per 64K region, while
"full" touches every 4K page. "seqhint" uses MADV_SEQUENTIAL; "nohint" does
not.

Virtio-block swap device present, zswap enabled:

+-----------------+---------------+---------------+--------+---------+--------+--------+---------+
| workload        | base elapsed  | RFC elapsed   | delta  | refault | pswpin | zswpin | RFC 64K |
+-----------------+---------------+---------------+--------+---------+--------+--------+---------+
| seq64           | 4.399+/-0.100 | 4.279+/-0.216 | -2.7%  | -10.5%  | 0      | 1.000x | 3110.7  |
| stride64_sparse | 3.103+/-0.047 | 3.119+/-0.086 | +0.5%  |  +6.2%  | 0      | 0.999x |    0.0  |
| random64_sparse | 3.142+/-0.112 | 3.097+/-0.030 | -1.4%  |  -2.2%  | 0      | 0.999x |    0.1  |
| random64_full   | 4.473+/-0.147 | 4.445+/-0.088 | -0.6%  |  +0.9%  | 0      | 1.000x |    0.4  |
+-----------------+---------------+---------------+--------+---------+--------+--------+---------+

This run uses a real block swap device, but the refaulted data stayed in
zswap. It covers the all-zswap hit path with disk swap configured, not disk
read IO.

Virtio-block pressure/mixed run, zswap max_pool_percent=1,
low-compressibility full fill:

+-------------------------------+---------------+---------------+--------+---------+-----------------+------------+---------+----------+
| workload                      | base elapsed  | RFC elapsed   | delta  | refault | pswpin base/RFC | RFC zswpin | RFC 64K | fallback |
+-------------------------------+---------------+---------------+--------+---------+-----------------+------------+---------+----------+
| seq64_full_pressure           | 5.908+/-0.195 | 5.790+/-0.235 | -2.0%  |  +3.0%  | 90258/99038     | 20327      |   0.0   | 3730     |
| random64_sparse_full_pressure | 5.104+/-0.069 | 5.068+/-0.090 | -0.7%  |  -9.1%  |  6201/6196      |  1297      |   0.0   |    0     |
+-------------------------------+---------------+---------------+--------+---------+-----------------+------------+---------+----------+

This run reaches the disk-backed path: pswpin is non-zero in both base and
RFC. It is mainly fallback coverage. The RFC does not install 64K folios
for these disk/mixed-heavy ranges.

Policy matrix, virtio-block swap device present:

+------------------------------+----+------+--------+--------+-------+----------+
| case                         | pc | hint | pswpin | zswpin | zswpwb| 64K in   |
+------------------------------+----+------+--------+--------+-------+----------+
| pc0_seq                      | 0  | none | 0      | 99559  | 0     | 0        |
| pc3_seq                      | 3  | none | 0      | 99498  | 0     | 0        |
| pc4_seq                      | 4  | none | 0      | 99512  | 0     | 3109     |
| pc5_seq                      | 5  | none | 0      | 99657  | 0     | 3113     |
| hint_none_random_sparse      | 5  | none | 0      |  6265  | 0     | 0        |
| hint_random_seq              | 5  | rand | 0      | 99488  | 0     | 0        |
| mixed_seq_full               | 5  | none | 97725  | 20147  | 84    | 569      |
| mixed_random_sparse_full     | 5  | none |  6230  |  1302  | 0     | 0        |
+------------------------------+----+------+--------+--------+-------+----------+

The pc rows show the readahead-window gate. The hint_random_seq row shows
the explicit random hint veto. The mixed rows use a small zswap pool to
force disk/zswap split backing; most mixed ranges are rejected, while any
remaining 64K successes were all-zswap at the time of the fault.

Kbuild pressure, zram swap, 384M memcg:

+-----------------------+----------+----------+--------+--------+----------+
| setup                 | base     | RFC      | delta  | zswpin | RFC 64K  |
+-----------------------+----------+----------+--------+--------+----------+
| zram swap, 384M memcg | 2060.323 | 2047.516 | -0.6%  | 0.991x | 2797     |
+-----------------------+----------+----------+--------+--------+----------+

This is a single-run zram pressure smoke. It did not show a Kbuild
regression, and the RFC run installed 64K zswap-backed folios. The result
should not be read as a tuned-performance claim.

Kbuild pressure, virtio-block swap device, 512M memcg:

+-------------------------+----------+----------+--------+--------+----------+
| setup                   | base     | RFC      | delta  | pswpin | RFC 64K  |
+-------------------------+----------+----------+--------+--------+----------+
| disk swap, 512M memcg   | 1420.671 | 1409.263 | -0.8%  | 0      | 7497     |
+-------------------------+----------+----------+--------+--------+----------+

This is a single-run pressure smoke. The disk-swap Kbuild run also stayed
on the all-zswap hit path, so it is pressure coverage with a disk swap device
present rather than proof of disk-read large swapin.

Shmem smoke, tmpfs huge=always, 64K shmem mTHP:

+----------------------------+---------------+---------+-------------+----------+
| case                       | refault hint  | touched | 64K shmem   | 64K in   |
+----------------------------+---------------+---------+-------------+----------+
| nohint_seq                 | none          | 65536   | 4096        | 0        |
| seq_refault_hint           | sequential    | 65536   | 4096        | 4096     |
| random_refault_hint_sparse | random        |  4096   | 4096        | 0        |
+----------------------------+---------------+---------+-------------+----------+

That matches the current shmem producer: explicit sequential refault hints
allow large zswap swapin; no hint and random hints do not.

What this RFC does not establish
================================

The 64K cap is deliberate, but it is not tuned. The anon PTE-young rule is
only anon evidence. Shmem has the framework and explicit VMA hints in this
RFC, not a page-cache locality producer. For larger orders, the anon
producer should probably use bounded sampling instead of walking every PTE
in a 1M or larger candidate range. The mixed-backend tests cover fallback
behavior, but this series does not add mixed zswap/disk large IO.

Changes since RFC v1:

  - rebuilt the series on Kairui Song's common swapin/swap-table work;
  - moved large-swapin admission into common swapin code;
  - made zswap provide range facts and fully-zswap-backed folio loads;
  - rejected mixed zswap/disk large ranges before large IO;
  - capped zswap-backed swapin at 64K for this RFC;
  - added locality producers for anon, shmem hints and swap readahead;
  - covered cgroup v1 memsw accounting in speculative large-swapcache
    fallback paths;
  - added 10-run microbench data, mixed-backend pressure tests, shmem
    smoke tests, and zram/disk Kbuild pressure data.

fujunjie (9):
  mm/zswap: expose range state for swapin policy
  mm: let swap_read_folio() report retryable zswap races
  mm/zswap: support fully zswap-backed large folio loads
  mm: admit large swapin by backend range in swapin_sync()
  mm: add common locality admission for zswap large swapin
  mm: provide anon locality evidence for zswap large swapin
  mm/shmem: provide VMA-hint locality for zswap large swapin
  mm: try all-zswap large swapin within swap readahead windows
  docs: mm: update THP swapin counter descriptions

 Documentation/admin-guide/mm/transhuge.rst |  11 +-
 include/linux/zswap.h                      |  26 +
 mm/memcontrol-v1.c                         |   8 +-
 mm/memory.c                                | 269 +++++++-
 mm/page_io.c                               |  19 +-
 mm/shmem.c                                 |  42 +-
 mm/swap.h                                  |  21 +-
 mm/swap_state.c                            | 681 +++++++++++++++++++--
 mm/swapfile.c                              |   2 +-
 mm/zswap.c                                 | 149 ++++-
 10 files changed, 1099 insertions(+), 129 deletions(-)


base-commit: 404fb4f38e8f38469dfff4df0205c9d18eeb1f57
-- 
2.34.1


^ permalink raw reply

* [RFC PATCH v2 1/9] mm/zswap: expose range state for swapin policy
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

Large folio swapin needs to know whether a candidate swap range is fully
backed by zswap before it can choose an order. That decision should stay
in common swapin code, not inside zswap.

Export two zswap facts for that caller: a lockless range occupancy snapshot
and the current zswap reclaim-pressure state. The range state is
advisory only. Writeback or invalidation can change the backend after the
snapshot, so users must recheck before issuing large-folio IO.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 include/linux/zswap.h | 26 +++++++++++++++++++++++++
 mm/zswap.c            | 44 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 30c193a1207e..8f9aee97517c 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -9,6 +9,18 @@ struct lruvec;
 
 extern atomic_long_t zswap_stored_pages;
 
+/*
+ * Advisory zswap occupancy snapshot for a swap range. This is not a complete
+ * backend classifier; callers must recheck before depending on ALL_ZSWAP for
+ * large-folio IO.
+ */
+enum zswap_range_state {
+	ZSWAP_RANGE_NEVER_ENABLED,
+	ZSWAP_RANGE_NO_ZSWAP,
+	ZSWAP_RANGE_ALL_ZSWAP,
+	ZSWAP_RANGE_MIXED,
+};
+
 #ifdef CONFIG_ZSWAP
 
 struct zswap_lruvec_state {
@@ -27,6 +39,9 @@ struct zswap_lruvec_state {
 unsigned long zswap_total_pages(void);
 bool zswap_store(struct folio *folio);
 int zswap_load(struct folio *folio);
+enum zswap_range_state zswap_probe_range(swp_entry_t swp,
+					 unsigned int nr_pages);
+bool zswap_pool_reclaim_pressure(void);
 void zswap_invalidate(swp_entry_t swp);
 int zswap_swapon(int type, unsigned long nr_pages);
 void zswap_swapoff(int type);
@@ -49,6 +64,17 @@ static inline int zswap_load(struct folio *folio)
 	return -ENOENT;
 }
 
+static inline enum zswap_range_state zswap_probe_range(swp_entry_t swp,
+						       unsigned int nr_pages)
+{
+	return ZSWAP_RANGE_NEVER_ENABLED;
+}
+
+static inline bool zswap_pool_reclaim_pressure(void)
+{
+	return false;
+}
+
 static inline void zswap_invalidate(swp_entry_t swp) {}
 static inline int zswap_swapon(int type, unsigned long nr_pages)
 {
diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..da5297f7bd69 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -506,6 +506,19 @@ unsigned long zswap_total_pages(void)
 	return total;
 }
 
+/*
+ * Expose whether zswap reclaim pressure is active. This is a backend fact:
+ * zswap_check_limits() sets the state once the pool reaches the hard limit and
+ * keeps it set until the pool falls below the accept threshold.
+ */
+bool zswap_pool_reclaim_pressure(void)
+{
+	if (zswap_never_enabled())
+		return false;
+
+	return READ_ONCE(zswap_pool_reached_full);
+}
+
 static bool zswap_check_limits(void)
 {
 	unsigned long cur_pages = zswap_total_pages();
@@ -1559,6 +1572,37 @@ bool zswap_store(struct folio *folio)
 	return ret;
 }
 
+enum zswap_range_state zswap_probe_range(swp_entry_t swp,
+					 unsigned int nr_pages)
+{
+	unsigned int type = swp_type(swp);
+	pgoff_t offset = swp_offset(swp);
+	bool present = false, missing = false;
+	unsigned int i;
+
+	/*
+	 * This is an advisory, lockless snapshot for common swapin admission.
+	 * Callers must recheck before depending on an all-zswap range for IO:
+	 * concurrent writeback or invalidation can change the backend state.
+	 */
+	if (zswap_never_enabled())
+		return ZSWAP_RANGE_NEVER_ENABLED;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct xarray *tree = swap_zswap_tree(swp_entry(type, offset + i));
+
+		if (xa_load(tree, offset + i))
+			present = true;
+		else
+			missing = true;
+
+		if (present && missing)
+			return ZSWAP_RANGE_MIXED;
+	}
+
+	return present ? ZSWAP_RANGE_ALL_ZSWAP : ZSWAP_RANGE_NO_ZSWAP;
+}
+
 /**
  * zswap_load() - load a folio from zswap
  * @folio: folio to load
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 2/9] mm: let swap_read_folio() report retryable zswap races
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

Large zswap loads need a way to ask the caller to drop a speculative large
swapcache folio and retry order-0. A void swap_read_folio() cannot express
that without turning a backend race into an IO failure.

Return int from swap_read_folio() and reserve -EAGAIN for retryable large
zswap races. Existing order-0 paths keep treating the read as before; the
synchronous swapin path only warns for now. A later patch will consume
-EAGAIN and retry order-0.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/page_io.c    | 19 +++++++++++++++++--
 mm/swap.h       |  5 +++--
 mm/swap_state.c | 13 +++++++++++--
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index f2d8fe7fd057..16724bdfb400 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -653,13 +653,21 @@ static void swap_read_folio_bdev_async(struct folio *folio,
 	submit_bio(bio);
 }
 
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+/*
+ * Return -EAGAIN only when a locked large swapcache folio hit a retryable
+ * zswap backend race. The caller owns that still-locked folio and must drop or
+ * retry it. Other zswap errors are still reported through the usual folio
+ * state: the folio is unlocked without PG_uptodate and the fault path will
+ * turn that into an I/O error.
+ */
+int swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 {
 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
 	bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
 	bool workingset = folio_test_workingset(folio);
 	unsigned long pflags;
 	bool in_thrashing;
+	int ret = 0;
 
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -681,8 +689,14 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 		goto finish;
 	}
 
-	if (zswap_load(folio) != -ENOENT)
+	ret = zswap_load(folio);
+	if (ret == -EAGAIN) {
+		VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
 		goto finish;
+	}
+	if (ret != -ENOENT)
+		goto finish;
+	ret = 0;
 
 	/* We have to read from slower devices. Increase zswap protection. */
 	zswap_folio_swapin(folio);
@@ -701,6 +715,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 		psi_memstall_leave(&pflags);
 	}
 	delayacct_swapin_end();
+	return ret;
 }
 
 void __swap_read_unplug(struct swap_iocb *sio)
diff --git a/mm/swap.h b/mm/swap.h
index 77d2d14eda42..ea7e1f3c4410 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -241,7 +241,7 @@ extern void __swap_cluster_free_entries(struct swap_info_struct *si,
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
 struct swap_iocb;
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug);
+int swap_read_folio(struct folio *folio, struct swap_iocb **plug);
 void __swap_read_unplug(struct swap_iocb *plug);
 static inline void swap_read_unplug(struct swap_iocb *plug)
 {
@@ -381,8 +381,9 @@ static inline void folio_put_swap(struct folio *folio, struct page *page)
 {
 }
 
-static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+static inline int swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 {
+	return 0;
 }
 
 static inline void swap_write_unplug(struct swap_iocb *sio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 04f5ce992401..d37097913b30 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -628,6 +628,7 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 					   struct swap_iocb **plug, bool readahead)
 {
 	struct folio *folio;
+	int ret;
 
 	do {
 		folio = swap_cache_get_folio(entry);
@@ -639,7 +640,13 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 	if (IS_ERR_OR_NULL(folio))
 		return NULL;
 
-	swap_read_folio(folio, plug);
+	ret = swap_read_folio(folio, plug);
+	/*
+	 * Swap readahead allocates order-0 folios. -EAGAIN is reserved for
+	 * retryable large zswap backend races and must be handled by the
+	 * synchronous common swapin path.
+	 */
+	VM_WARN_ON_ONCE(ret == -EAGAIN);
 	if (readahead) {
 		folio_set_readahead(folio);
 		count_vm_event(SWAP_RA);
@@ -668,6 +675,7 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
 			   struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
 {
 	struct folio *folio;
+	int ret;
 
 	do {
 		folio = swap_cache_get_folio(entry);
@@ -679,7 +687,8 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
 	if (IS_ERR(folio))
 		return folio;
 
-	swap_read_folio(folio, NULL);
+	ret = swap_read_folio(folio, NULL);
+	VM_WARN_ON_ONCE(ret == -EAGAIN);
 	return folio;
 }
 
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 5/9] mm: add common locality admission for zswap large swapin
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

Fully zswap-backed ranges are safe to load as a large folio only when
the caller has a reason to expect the neighbouring slots to be useful.
Otherwise a sparse refault can turn one 4K demand fault into a 64K
decompression and swapcache fill.

Add a common admission gate for zswap-backed large swapin. The common
layer keeps backend checks, the 64K cap, recent-refault rejection, and
zswap reclaim-pressure rejection. It consumes a caller-provided locality
order mask instead of looking at anon or shmem state directly.

Callers pass no locality evidence for now, so this patch only installs
the common policy hook. Later patches add anon and shmem producers.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memory.c     |   2 +-
 mm/shmem.c      |   2 +-
 mm/swap.h       |   8 ++--
 mm/swap_state.c | 118 ++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index d73a19692dea..92a82008d583 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4849,7 +4849,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
 			folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
 					    thp_swapin_suitable_orders(vmf) | BIT(0),
-					    vmf, NULL, 0);
+					    0, vmf, NULL, 0);
 		else
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 56c23a7b15c7..fa99b48ed62b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2031,7 +2031,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 
 again:
 	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
-	folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx);
+	folio = swapin_sync(entry, gfp, BIT(order), 0, vmf, mpol, ilx);
 	mpol_cond_put(mpol);
 
 	if (!IS_ERR(folio))
diff --git a/mm/swap.h b/mm/swap.h
index ea7e1f3c4410..dd35a310d06d 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -323,9 +323,10 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 		struct mempolicy *mpol, pgoff_t ilx);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
-		struct vm_fault *vmf);
+			struct vm_fault *vmf);
 struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
-			   struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx);
+			  unsigned long locality_orders, struct vm_fault *vmf,
+			  struct mempolicy *mpol, pgoff_t ilx);
 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 			   unsigned long addr);
 
@@ -418,7 +419,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 
 static inline struct folio *swapin_sync(
 	swp_entry_t entry, gfp_t flag, unsigned long orders,
-	struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
+	unsigned long locality_orders, struct vm_fault *vmf,
+	struct mempolicy *mpol, pgoff_t ilx)
 {
 	return NULL;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f03ad4832f16..5a4ca289009a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,7 @@
 #include <linux/migrate.h>
 #include <linux/vmalloc.h>
 #include <linux/huge_mm.h>
+#include <linux/sizes.h>
 #include <linux/zswap.h>
 #include <linux/shmem_fs.h>
 #include "internal.h"
@@ -556,6 +557,24 @@ static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
 					mpol, ilx, true);
 }
 
+/*
+ * Initial conservative cap for speculative zswap large swapin. Locality
+ * evidence is supplied by the caller or by generic VMA hints; the common
+ * swapin layer keeps backend safety and pressure decisions here.
+ */
+#define SWAPIN_ZSWAP_MAX_SIZE			SZ_64K
+#if PAGE_SIZE < SWAPIN_ZSWAP_MAX_SIZE
+#define SWAPIN_ZSWAP_MAX_ORDER			\
+	ilog2(SWAPIN_ZSWAP_MAX_SIZE / PAGE_SIZE)
+#else
+#define SWAPIN_ZSWAP_MAX_ORDER			0
+#endif
+
+struct zswap_admit_ctx {
+	bool pressure_checked;
+	bool reclaim_pressure;
+};
+
 static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
 {
 	unsigned int ci_start = swp_cluster_offset(entry);
@@ -586,11 +605,84 @@ static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
 	return true;
 }
 
+static bool swapin_zswap_locality(struct vm_fault *vmf, unsigned int order,
+				  unsigned long locality_orders)
+{
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+
+	if (!order || order > MAX_PAGE_ORDER)
+		return false;
+
+	if (vma && (vma->vm_flags & VM_RAND_READ))
+		return false;
+
+	return locality_orders & BIT(order);
+}
+
+static bool swapin_zswap_refaulted(swp_entry_t entry, unsigned int nr_pages)
+{
+	unsigned int type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	unsigned int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		bool workingset;
+		void *shadow;
+
+		shadow = swap_cache_get_shadow(swp_entry(type, offset + i));
+		if (!shadow)
+			continue;
+		if (workingset_test_recent(shadow, false, &workingset, false) &&
+		    workingset)
+			return true;
+	}
+
+	return false;
+}
+
+static bool swapin_zswap_admit(swp_entry_t entry,
+			       unsigned int order, unsigned int nr_pages,
+			       struct vm_fault *vmf,
+			       unsigned long locality_orders,
+			       struct zswap_admit_ctx *ctx)
+{
+	if (order > SWAPIN_ZSWAP_MAX_ORDER)
+		return false;
+
+	/*
+	 * Treat zswap-backed large swapin as speculative. The common layer
+	 * consumes caller-provided locality orders, but does not inspect
+	 * anon-specific PTE state or shmem-specific mapping state directly.
+	 */
+	if (!swapin_zswap_locality(vmf, order, locality_orders))
+		return false;
+
+	/*
+	 * A recent workingset refault shadow in the target range means reclaim
+	 * already saw churn there. Keep the refault path narrow instead of
+	 * speculatively decompressing neighbouring slots.
+	 */
+	if (swapin_zswap_refaulted(entry, nr_pages))
+		return false;
+
+	if (!ctx->pressure_checked) {
+		ctx->reclaim_pressure = zswap_pool_reclaim_pressure();
+		ctx->pressure_checked = true;
+	}
+	if (ctx->reclaim_pressure)
+		return false;
+
+	return true;
+}
+
 static unsigned long swapin_admit_orders(swp_entry_t entry,
-					 unsigned long orders)
+					 unsigned long orders,
+					 struct vm_fault *vmf,
+					 unsigned long locality_orders)
 {
 	unsigned long candidates = orders & ~BIT(0);
 	unsigned long admitted = orders & BIT(0);
+	struct zswap_admit_ctx zswap_ctx = {};
 	int order;
 
 	if (!candidates)
@@ -616,9 +708,14 @@ static unsigned long swapin_admit_orders(swp_entry_t entry,
 
 		state = zswap_probe_range(range_entry, nr_pages);
 		switch (state) {
+		case ZSWAP_RANGE_ALL_ZSWAP:
+			admit = swapin_zswap_admit(range_entry, order,
+						   nr_pages, vmf,
+						   locality_orders,
+						   &zswap_ctx);
+			break;
 		case ZSWAP_RANGE_MIXED:
 			break;
-		case ZSWAP_RANGE_ALL_ZSWAP:
 		case ZSWAP_RANGE_NEVER_ENABLED:
 		case ZSWAP_RANGE_NO_ZSWAP:
 			admit = true;
@@ -769,8 +866,8 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 	ret = swap_read_folio(folio, plug);
 	/*
 	 * Swap readahead allocates order-0 folios. -EAGAIN is reserved for
-	 * retryable large zswap backend races and must be handled by the
-	 * synchronous common swapin path.
+	 * retryable large zswap backend races and should never escape to this
+	 * order-0 path.
 	 */
 	VM_WARN_ON_ONCE(ret == -EAGAIN);
 	if (readahead) {
@@ -786,6 +883,7 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
  * @entry: swap entry indicating the target slot
  * @gfp: memory allocation flags
  * @orders: allocation orders
+ * @locality_orders: orders with caller-provided locality evidence
  * @vmf: fault information
  * @mpol: NUMA memory allocation policy to be applied
  * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
@@ -794,16 +892,20 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
  * existing folio in the swap cache for @entry. This initiates the IO, too,
  * if needed. @entry is rounded down if @orders allow large allocation.
  *
- * Context: Caller must ensure @entry is valid and pin the swap device with refcount.
+ * Context: Caller must ensure @entry is valid and pin the swap device with
+ * refcount.
  * Return: Returns the folio on success, error code if failed.
  */
-struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
-			   struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
+struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp,
+			  unsigned long orders,
+			  unsigned long locality_orders,
+			  struct vm_fault *vmf, struct mempolicy *mpol,
+			  pgoff_t ilx)
 {
 	struct folio *folio;
 	int ret;
 
-	orders = swapin_admit_orders(entry, orders);
+	orders = swapin_admit_orders(entry, orders, vmf, locality_orders);
 again:
 	do {
 		folio = swap_cache_get_folio(entry);
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 3/9] mm/zswap: support fully zswap-backed large folio loads
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

zswap currently refuses large swapcache folios. That is correct for mixed
backend ranges, but it also prevents the common swapin path from loading a
range that is still fully backed by zswap.

Teach zswap_load() to fill a locked large swapcache folio by decompressing
each base-page entry into the matching folio offset, then flushing the
folio once. A missing entry after zswap data has been seen is reported as
-EAGAIN so the caller can drop the speculative large folio and retry
order-0.

The large load keeps the zswap entries in place. It is a clean speculative
fill: until the swap slots are freed, zswap remains the backing copy if
reclaim drops the large folio before PTEs are installed.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/zswap.c | 105 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 18 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index da5297f7bd69..94ba112a2982 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -15,6 +15,8 @@
 
 #include <linux/module.h>
 #include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/huge_mm.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -934,7 +936,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
-static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio,
+			     unsigned int page_idx, bool flush_dcache)
 {
 	struct zswap_pool *pool = entry->pool;
 	struct scatterlist input[2]; /* zsmalloc returns an SG list 1-2 entries */
@@ -952,14 +955,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 		WARN_ON_ONCE(input->length != PAGE_SIZE);
 
-		dst = kmap_local_folio(folio, 0);
+		dst = kmap_local_folio(folio, page_idx * PAGE_SIZE);
 		memcpy_from_sglist(dst, input, 0, PAGE_SIZE);
 		dlen = PAGE_SIZE;
 		kunmap_local(dst);
-		flush_dcache_folio(folio);
+		if (flush_dcache)
+			flush_dcache_folio(folio);
 	} else {
 		sg_init_table(&output, 1);
-		sg_set_folio(&output, folio, PAGE_SIZE, 0);
+		sg_set_folio(&output, folio, PAGE_SIZE, page_idx * PAGE_SIZE);
 		acomp_request_set_params(acomp_ctx->req, input, &output,
 					 entry->length, PAGE_SIZE);
 		ret = crypto_acomp_decompress(acomp_ctx->req);
@@ -1042,7 +1046,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 		goto out;
 	}
 
-	if (!zswap_decompress(entry, folio)) {
+	if (!zswap_decompress(entry, folio, 0, true)) {
 		ret = -EIO;
 		goto out;
 	}
@@ -1615,10 +1619,9 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
  *  NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
  *  will SIGBUS).
  *
- *  -EINVAL: if the swapped out content was in zswap, but the page belongs
- *  to a large folio, which is not supported by zswap. The folio is unlocked,
- *  but NOT marked up-to-date, so that an IO error is emitted (e.g.
- *  do_swap_page() will SIGBUS).
+ *  -EAGAIN: if the swapped out content belongs to a large folio, but the
+ *  range is mixed or raced with writeback. The folio remains locked so the
+ *  caller can drop the large swapcache folio and retry order-0.
  *
  *  -ENOENT: if the swapped out content was not in zswap. The folio remains
  *  locked on return.
@@ -1626,9 +1629,12 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
 int zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
+	unsigned int nr_pages = folio_nr_pages(folio);
+	unsigned int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
-	struct xarray *tree = swap_zswap_tree(swp);
+	struct xarray *tree;
 	struct zswap_entry *entry;
+	unsigned int i;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1636,21 +1642,84 @@ int zswap_load(struct folio *folio)
 	if (zswap_never_enabled())
 		return -ENOENT;
 
-	/*
-	 * Large folios should not be swapped in while zswap is being used, as
-	 * they are not properly handled. Zswap does not properly load large
-	 * folios, and a large folio may only be partially in zswap.
-	 */
-	if (WARN_ON_ONCE(folio_test_large(folio))) {
+	if (folio_test_large(folio)) {
+		struct obj_cgroup *first_objcg = NULL;
+		bool same_objcg = true;
+		bool saw_zswap = false;
+		bool saw_non_zswap = false;
+
+		/*
+		 * The locked large swapcache folio now covers the range and
+		 * conflicts with zswap writeback's order-0 swapcache allocation.
+		 * If the range is mixed or an entry disappears, retry order-0.
+		 */
+		for (i = 0; i < nr_pages; i++) {
+			tree = swap_zswap_tree(swp_entry(type, offset + i));
+			entry = xa_load(tree, offset + i);
+			if (!entry) {
+				if (saw_zswap)
+					return -EAGAIN;
+				saw_non_zswap = true;
+				continue;
+			}
+			if (saw_non_zswap)
+				return -EAGAIN;
+
+			if (!saw_zswap)
+				first_objcg = entry->objcg;
+			else if (entry->objcg != first_objcg)
+				same_objcg = false;
+			saw_zswap = true;
+		}
+		if (!saw_zswap)
+			return -ENOENT;
+
+		for (i = 0; i < nr_pages; i++) {
+			tree = swap_zswap_tree(swp_entry(type, offset + i));
+			entry = xa_load(tree, offset + i);
+			if (!entry)
+				return -EAGAIN;
+
+			if (!zswap_decompress(entry, folio, i, false)) {
+				folio_unlock(folio);
+				return -EIO;
+			}
+		}
+
+		flush_dcache_folio(folio);
+		/*
+		 * Keep zswap entries until swap slots are freed. This is a clean
+		 * speculative fill; zswap remains the backing copy if reclaim
+		 * drops the large folio before PTEs are installed.
+		 */
+		folio_mark_uptodate(folio);
+		count_vm_events(ZSWPIN, nr_pages);
+		count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+
+		if (same_objcg) {
+			if (first_objcg)
+				count_objcg_events(first_objcg, ZSWPIN, nr_pages);
+		} else {
+			for (i = 0; i < nr_pages; i++) {
+				tree = swap_zswap_tree(swp_entry(type, offset + i));
+				entry = xa_load(tree, offset + i);
+				if (WARN_ON_ONCE(!entry))
+					continue;
+				if (entry->objcg)
+					count_objcg_events(entry->objcg, ZSWPIN, 1);
+			}
+		}
+
 		folio_unlock(folio);
-		return -EINVAL;
+		return 0;
 	}
 
+	tree = swap_zswap_tree(swp);
 	entry = xa_load(tree, offset);
 	if (!entry)
 		return -ENOENT;
 
-	if (!zswap_decompress(entry, folio)) {
+	if (!zswap_decompress(entry, folio, 0, true)) {
 		folio_unlock(folio);
 		return -EIO;
 	}
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 6/9] mm: provide anon locality evidence for zswap large swapin
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

The common zswap large-swapin policy needs locality evidence from
callers before it can admit a large folio. For anonymous faults, provide
that evidence from existing VMA hints (VM_SEQ_READ, VM_RAND_READ) and
from the PTE young state left by earlier zswap-backed large swapins.

Keep non-faulting PTEs old when mapping a speculative all-zswap large
folio. A later fault can then require that the previous range be densely
young before admitting another large swapin, without adding VMA state.

This also removes the old zswap-enabled guard from the THP swapin
candidate scan. The common swapin path now classifies the backend range
and falls back to order-0 for mixed zswap/disk ranges or races.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memory.c     | 234 +++++++++++++++++++++++++++++++++++++++++++-----
 mm/swap.h       |   6 ++
 mm/swap_state.c |  15 ++++
 3 files changed, 235 insertions(+), 20 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 92a82008d583..7bbb89632000 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4556,6 +4556,35 @@ static void memcg1_swapin_retry_folio(struct folio *folio,
 	folio_unlock(folio);
 }
 
+static void set_swapin_ptes(struct vm_area_struct *vma,
+			    unsigned long address, pte_t *ptep, pte_t pte,
+			    unsigned int nr_pages, unsigned int fault_pte_idx,
+			    bool fault_only_young)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t old_pte;
+
+	if (!fault_only_young || nr_pages == 1) {
+		set_ptes(mm, address, ptep, pte, nr_pages);
+		return;
+	}
+
+	old_pte = pte_mkold(pte);
+	if (fault_pte_idx)
+		set_ptes(mm, address, ptep, old_pte, fault_pte_idx);
+
+	set_pte_at(mm, address + fault_pte_idx * PAGE_SIZE,
+		   ptep + fault_pte_idx,
+		   pte_mkyoung(pte_advance_pfn(pte, fault_pte_idx)));
+
+	fault_pte_idx++;
+	if (fault_pte_idx < nr_pages)
+		set_ptes(mm, address + fault_pte_idx * PAGE_SIZE,
+			 ptep + fault_pte_idx,
+			 pte_advance_pfn(old_pte, fault_pte_idx),
+			 nr_pages - fault_pte_idx);
+}
+
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 {
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
@@ -4628,6 +4657,157 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define SWAPIN_ANON_YOUNG_MIN_PERCENT		75
+#define SWAPIN_ANON_MAX_FAULT_SKIP_SHIFT	2
+
+static bool swapin_anon_prev_young_dense(struct vm_fault *vmf,
+					 unsigned int order)
+{
+	struct vm_area_struct *vma;
+	unsigned int nr_pages;
+	unsigned int threshold;
+	unsigned long size;
+	unsigned long base, prev, addr;
+	struct folio *first = NULL;
+	unsigned int present = 0;
+	unsigned int young = 0;
+	pmd_t *pmd;
+	pmd_t pmdval;
+	spinlock_t *ptl; /* protects the previous PTE range */
+	pte_t *ptep;
+	unsigned int i;
+
+	if (!IS_ENABLED(CONFIG_MMU) || !arch_has_hw_pte_young() || !vmf ||
+	    !vmf->vma || !vmf->pmd || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	nr_pages = 1U << order;
+	threshold = DIV_ROUND_UP(nr_pages *
+				 SWAPIN_ANON_YOUNG_MIN_PERCENT, 100);
+	size = PAGE_SIZE << order;
+
+	vma = vmf->vma;
+	base = ALIGN_DOWN(vmf->address, size);
+	if (base < size)
+		return false;
+
+	prev = base - size;
+	if (prev < vma->vm_start || prev + size > vma->vm_end)
+		return false;
+
+	pmd = vmf->pmd;
+	if ((prev & PMD_MASK) != (base & PMD_MASK)) {
+		pmd = mm_find_pmd(vma->vm_mm, prev);
+		if (!pmd)
+			return false;
+	}
+
+	pmdval = pmdp_get_lockless(pmd);
+	if (!pmd_present(pmdval) || pmd_leaf(pmdval))
+		return false;
+
+	ptep = pte_offset_map_lock(vma->vm_mm, pmd, prev, &ptl);
+	if (!ptep)
+		return false;
+
+	for (i = 0, addr = prev; i < nr_pages; i++, addr += PAGE_SIZE) {
+		struct folio *folio;
+		pte_t pte = ptep_get(ptep + i);
+
+		if (!pte_present(pte))
+			break;
+
+		folio = vm_normal_folio(vma, addr, pte);
+		if (!folio || folio_order(folio) != order)
+			break;
+		if (!first)
+			first = folio;
+		else if (folio != first)
+			break;
+
+		present++;
+		if (pte_young(pte))
+			young++;
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+	if (present != nr_pages)
+		return false;
+
+	return young >= threshold;
+}
+
+static bool swapin_anon_accessed_neighbour(struct vm_fault *vmf,
+					   unsigned int order)
+{
+	unsigned long size;
+	unsigned long base;
+	unsigned long fault_idx;
+	unsigned long max_skip;
+
+	if (!vmf || !vmf->vma || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	size = PAGE_SIZE << order;
+	base = ALIGN_DOWN(vmf->address, size);
+
+	/*
+	 * Without a sequential hint, require prior young-density evidence and
+	 * only allow faults near the start of the candidate range.
+	 */
+	fault_idx = (vmf->address - base) >> PAGE_SHIFT;
+	max_skip = (1UL << order) >> SWAPIN_ANON_MAX_FAULT_SKIP_SHIFT;
+	if (fault_idx > max_skip)
+		return false;
+
+	return swapin_anon_prev_young_dense(vmf, order);
+}
+
+static bool swapin_anon_fault_starts_range(struct vm_fault *vmf,
+					   unsigned int order)
+{
+	struct vm_area_struct *vma;
+	unsigned long size;
+	unsigned long base;
+	unsigned long first;
+
+	if (!vmf || !vmf->vma || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	vma = vmf->vma;
+	size = PAGE_SIZE << order;
+	base = ALIGN_DOWN(vmf->address, size);
+	first = ALIGN(vma->vm_start, size);
+
+	return base == first && vmf->address == base &&
+	       base + size <= vma->vm_end;
+}
+
+static unsigned long swapin_anon_locality_orders(struct vm_fault *vmf,
+						 unsigned long orders)
+{
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	unsigned long locality_orders = 0;
+	unsigned long candidates = orders & ~BIT(0);
+	int order;
+
+	if (vma && (vma->vm_flags & VM_RAND_READ))
+		return 0;
+
+	if (vma && (vma->vm_flags & VM_SEQ_READ))
+		return candidates;
+
+	while (candidates) {
+		order = fls_long(candidates) - 1;
+		if (swapin_anon_fault_starts_range(vmf, order) ||
+		    swapin_anon_accessed_neighbour(vmf, order))
+			locality_orders |= BIT(order);
+		candidates &= ~BIT(order);
+	}
+
+	return locality_orders;
+}
+
 /*
  * Check if the PTEs within a range are contiguous swap entries.
  */
@@ -4644,9 +4824,9 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
 		return false;
 	/*
-	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
+	 * swap_read_folio() can't do mixed-backend large folio IO. The common
+	 * synchronous swapin path will recheck backend state and fall back to
+	 * order-0 if a zswap/disk race makes the range mixed.
 	 */
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
@@ -4693,14 +4873,6 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		return 0;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		return 0;
-
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4708,10 +4880,13 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 	 */
 	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
 					  BIT(PMD_ORDER) - 1);
+	if (!orders)
+		return 0;
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+	if (!orders)
+		return 0;
 	orders = thp_swap_suitable_orders(swp_offset(entry),
 					  vmf->address, orders);
-
 	if (!orders)
 		return 0;
 
@@ -4741,6 +4916,12 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 {
 	return 0;
 }
+
+static unsigned long swapin_anon_locality_orders(struct vm_fault *vmf,
+						 unsigned long orders)
+{
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /* Sanity check that a folio is fully exclusive */
@@ -4777,6 +4958,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	unsigned long page_idx;
 	unsigned long address;
 	pte_t *ptep;
+	bool fault_only_young = false;
 
 	if (!pte_unmap_same(vmf))
 		goto out;
@@ -4845,13 +5027,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
 	if (!folio) {
-		/* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+		/*
+		 * Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices.
+		 * The swap device is pinned while checking the flag, matching
+		 * the existing fault path.
+		 */
+		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+			unsigned long swapin_orders = thp_swapin_suitable_orders(vmf);
+			unsigned long locality_orders =
+				swapin_anon_locality_orders(vmf, swapin_orders);
+
 			folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
-					    thp_swapin_suitable_orders(vmf) | BIT(0),
-					    0, vmf, NULL, 0);
-		else
+					    swapin_orders | BIT(0),
+					    locality_orders, vmf, NULL, 0);
+		} else {
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
+		}
 
 		if (IS_ERR_OR_NULL(folio)) {
 			/*
@@ -5110,9 +5301,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 	VM_BUG_ON(!folio_test_anon(folio) ||
 			(pte_write(pte) && !PageAnonExclusive(page)));
-	set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
-	arch_do_swap_page_nr(vma->vm_mm, vma, address,
-			pte, pte, nr_pages);
+	if (folio == swapcache && nr_pages == folio_nr_pages(folio) &&
+	    arch_has_hw_pte_young())
+		fault_only_young = swapin_fault_only_young(folio);
+	set_swapin_ptes(vma, address, ptep, pte, nr_pages, page_idx,
+			fault_only_young);
+	arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages);
 
 	/*
 	 * Remove the swap entry and conditionally try to free up the swapcache.
diff --git a/mm/swap.h b/mm/swap.h
index dd35a310d06d..5d1c81ab49b9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -327,6 +327,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
 struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
 			  unsigned long locality_orders, struct vm_fault *vmf,
 			  struct mempolicy *mpol, pgoff_t ilx);
+bool swapin_fault_only_young(struct folio *folio);
 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 			   unsigned long addr);
 
@@ -430,6 +431,11 @@ static inline void swap_update_readahead(struct folio *folio,
 {
 }
 
+static inline bool swapin_fault_only_young(struct folio *folio)
+{
+	return false;
+}
+
 static inline int swap_writeout(struct folio *folio,
 		struct swap_iocb **swap_plug)
 {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5a4ca289009a..80dff6a1ee65 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -747,6 +747,21 @@ static bool zswap_needs_order0_retry(struct folio *folio)
 	       ZSWAP_RANGE_MIXED;
 }
 
+/*
+ * A speculative large swapin may install PTEs for pages that did not fault.
+ * Keep those non-faulting PTEs old so a later anon fault can report
+ * PTE-young density as caller-provided locality evidence without storing
+ * state in the VMA.
+ */
+bool swapin_fault_only_young(struct folio *folio)
+{
+	if (!folio_test_large(folio) || !folio_test_swapcache(folio))
+		return false;
+
+	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
+	       ZSWAP_RANGE_ALL_ZSWAP;
+}
+
 /*
  * If we are the only user, then try to free up the swap cache.
  *
-- 
2.34.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox