The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Baolin Wang <baolin.wang@linux.alibaba.com>
To: kasong@tencent.com, linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>, Zi Yan <ziy@nvidia.com>,
	Barry Song <baohua@kernel.org>, Hugh Dickins <hughd@google.com>,
	Chris Li <chrisl@kernel.org>,
	Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Youngjun Park <youngjun.park@lge.com>,
	Chengming Zhou <chengming.zhou@linux.dev>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Muchun Song <muchun.song@linux.dev>,
	Qi Zheng <zhengqi.arch@bytedance.com>,
	linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
	Yosry Ahmed <yosry@kernel.org>, Lorenzo Stoakes <ljs@kernel.org>,
	Dev Jain <dev.jain@arm.com>, Lance Yang <lance.yang@linux.dev>,
	Michal Hocko <mhocko@suse.com>, Michal Hocko <mhocko@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Axel Rasmussen <axelrasmussen@google.com>
Subject: Re: [PATCH v3 05/12] mm, swap: unify large folio allocation
Date: Tue, 12 May 2026 18:10:31 +0800	[thread overview]
Message-ID: <d5341d37-5644-4446-a406-9a7251b83399@linux.alibaba.com> (raw)
In-Reply-To: <20260421-swap-table-p4-v3-5-2f23759a76bc@tencent.com>



On 4/21/26 2:16 PM, Kairui Song via B4 Relay wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> Now that direct large order allocation is supported in the swap cache,
> both anon and shmem can use it instead of implementing their own methods.
> This unifies the fallback and swap cache check, which also reduces the
> TOCTOU race window of swap cache state: previously, high order swapin
> required checking swap cache states first, then allocating and falling
> back separately. Now all these steps happen in the same compact loop.
> 
> Order fallback and statistics are also unified, callers just need to
> check and pass the acceptable order bitmask.
> 
> There is basically no behavior change. This only makes things more
> unified and prepares for later commits. Cgroup and zero map checks can
> also be moved into the compact loop, further reducing race windows and
> redundancy.
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>   mm/memory.c     |  77 ++++++------------------------
>   mm/shmem.c      |  94 +++++++++---------------------------
>   mm/swap.h       |  30 ++----------
>   mm/swap_state.c | 145 ++++++++++----------------------------------------------
>   mm/swapfile.c   |   3 +-
>   5 files changed, 67 insertions(+), 282 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index ea6568571131..404734a5bcff 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4593,26 +4593,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>   	return VM_FAULT_SIGBUS;
>   }
>   
> -static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
> -{
> -	struct vm_area_struct *vma = vmf->vma;
> -	struct folio *folio;
> -	softleaf_t entry;
> -
> -	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
> -	if (!folio)
> -		return NULL;
> -
> -	entry = softleaf_from_pte(vmf->orig_pte);
> -	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -					   GFP_KERNEL, entry)) {
> -		folio_put(folio);
> -		return NULL;
> -	}
> -
> -	return folio;
> -}
> -
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>   /*
>    * Check if the PTEs within a range are contiguous swap entries
> @@ -4642,8 +4622,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
>   	 */
>   	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
>   		return false;
> -	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> -		return false;
>   
>   	return true;
>   }
> @@ -4671,16 +4649,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
>   	return orders;
>   }
>   
> -static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
>   	unsigned long orders;
> -	struct folio *folio;
>   	unsigned long addr;
>   	softleaf_t entry;
>   	spinlock_t *ptl;
>   	pte_t *pte;
> -	gfp_t gfp;
>   	int order;
>   
>   	/*
> @@ -4688,7 +4664,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>   	 * maintain the uffd semantics.
>   	 */
>   	if (unlikely(userfaultfd_armed(vma)))
> -		goto fallback;
> +		return 0;
>   
>   	/*
>   	 * A large swapped out folio could be partially or fully in zswap. We
> @@ -4696,7 +4672,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>   	 * folio.
>   	 */
>   	if (!zswap_never_enabled())
> -		goto fallback;
> +		return 0;
>   
>   	entry = softleaf_from_pte(vmf->orig_pte);
>   	/*
> @@ -4710,12 +4686,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>   					  vmf->address, orders);
>   
>   	if (!orders)
> -		goto fallback;
> +		return 0;
>   
>   	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
>   				  vmf->address & PMD_MASK, &ptl);
>   	if (unlikely(!pte))
> -		goto fallback;
> +		return 0;
>   
>   	/*
>   	 * For do_swap_page, find the highest order where the aligned range is
> @@ -4731,29 +4707,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>   
>   	pte_unmap_unlock(pte, ptl);
>   
> -	/* Try allocating the highest of the remaining orders. */
> -	gfp = vma_thp_gfp_mask(vma);
> -	while (orders) {
> -		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> -		folio = vma_alloc_folio(gfp, order, vma, addr);
> -		if (folio) {
> -			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -							    gfp, entry))
> -				return folio;
> -			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> -			folio_put(folio);
> -		}
> -		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> -		order = next_order(&orders, order);
> -	}
> -
> -fallback:
> -	return __alloc_swap_folio(vmf);
> +	return orders;
>   }
>   #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
> -static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
>   {
> -	return __alloc_swap_folio(vmf);
> +	return 0;
>   }
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   
> @@ -4859,21 +4818,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   	if (folio)
>   		swap_update_readahead(folio, vma, vmf->address);
>   	if (!folio) {
> -		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
> -			folio = alloc_swap_folio(vmf);
> -			if (folio) {
> -				/*
> -				 * folio is charged, so swapin can only fail due
> -				 * to raced swapin and return NULL.
> -				 */
> -				swapcache = swapin_folio(entry, folio);
> -				if (swapcache != folio)
> -					folio_put(folio);
> -				folio = swapcache;
> -			}
> -		} else {
> +		/* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
> +		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
> +			folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
> +					     thp_swapin_suitable_orders(vmf),
> +					     vmf, NULL, 0);
> +		else
>   			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
> -		}
>   
>   		if (!folio) {
>   			/*
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5916acf594a8..17e3da11bb1d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
>   
>   static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>   			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
> -			struct vm_area_struct *vma, vm_fault_t *fault_type);
> +			struct vm_fault *vmf, vm_fault_t *fault_type);
>   
>   static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
>   {
> @@ -2017,68 +2017,24 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
>   }
>   
>   static struct folio *shmem_swap_alloc_folio(struct inode *inode,
> -		struct vm_area_struct *vma, pgoff_t index,
> +		struct vm_fault *vmf, pgoff_t index,
>   		swp_entry_t entry, int order, gfp_t gfp)
>   {
> +	pgoff_t ilx;
> +	struct folio *folio;
> +	struct mempolicy *mpol;
> +	unsigned long orders = BIT(order);
>   	struct shmem_inode_info *info = SHMEM_I(inode);
> -	struct folio *new, *swapcache;
> -	int nr_pages = 1 << order;
> -	gfp_t alloc_gfp = gfp;
> -
> -	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> -		if (WARN_ON_ONCE(order))
> -			return ERR_PTR(-EINVAL);
> -	} else if (order) {
> -		/*
> -		 * If uffd is active for the vma, we need per-page fault
> -		 * fidelity to maintain the uffd semantics, then fallback
> -		 * to swapin order-0 folio, as well as for zswap case.
> -		 * Any existing sub folio in the swap cache also blocks
> -		 * mTHP swapin.
> -		 */
> -		if ((vma && unlikely(userfaultfd_armed(vma))) ||
> -		     !zswap_never_enabled() ||
> -		     non_swapcache_batch(entry, nr_pages) != nr_pages)
> -			goto fallback;
>   
> -		alloc_gfp = thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
> -	}
> -retry:
> -	new = shmem_alloc_folio(alloc_gfp, order, info, index);
> -	if (!new) {
> -		new = ERR_PTR(-ENOMEM);
> -		goto fallback;
> -	}
> +	if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
> +	     !zswap_never_enabled())
> +		orders = 0;
>   
> -	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
> -					   alloc_gfp, entry)) {
> -		folio_put(new);
> -		new = ERR_PTR(-ENOMEM);
> -		goto fallback;
> -	}
> +	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
> +	folio = swapin_entry(entry, gfp, orders, vmf, mpol, ilx);
> +	mpol_cond_put(mpol);
>   
> -	swapcache = swapin_folio(entry, new);
> -	if (swapcache != new) {
> -		folio_put(new);
> -		if (!swapcache) {
> -			/*
> -			 * The new folio is charged already, swapin can
> -			 * only fail due to another raced swapin.
> -			 */
> -			new = ERR_PTR(-EEXIST);
> -			goto fallback;
> -		}
> -	}
> -	return swapcache;
> -fallback:
> -	/* Order 0 swapin failed, nothing to fallback to, abort */
> -	if (!order)
> -		return new;
> -	entry.val += index - round_down(index, nr_pages);
> -	alloc_gfp = gfp;
> -	nr_pages = 1;
> -	order = 0;
> -	goto retry;
> +	return folio;
>   }

IIUC, in the __swap_cache_alloc() implementation in patch 4, when shmem 
swapin falls back to order 0, it doesn't adjust the swap entry value 
like it does here, because the original swap entry may not correspond to 
the swap entry for the order-0 index.

Of course, I haven't tested this yet; I'm just pointing it out for you 
to double-check.

  parent reply	other threads:[~2026-05-12 10:10 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20260421-swap-table-p4-v3-0-2f23759a76bc@tencent.com>
     [not found] ` <20260421-swap-table-p4-v3-1-2f23759a76bc@tencent.com>
2026-05-06 13:51   ` [PATCH v3 01/12] mm, swap: simplify swap cache allocation helper Chris Li
2026-05-11  8:57     ` Kairui Song
     [not found] ` <20260421-swap-table-p4-v3-2-2f23759a76bc@tencent.com>
2026-05-06 14:42   ` [PATCH v3 02/12] mm, swap: move common swap cache operations into standalone helpers Chris Li
2026-05-12 14:48     ` Kairui Song
     [not found] ` <20260421-swap-table-p4-v3-3-2f23759a76bc@tencent.com>
2026-05-06 14:46   ` [PATCH v3 03/12] mm/huge_memory: move THP gfp limit helper into header Chris Li
     [not found]   ` <D631DCC9-85F0-4E68-88A0-AD5DE328818E@nvidia.com>
     [not found]     ` <CAMgjq7BDmGWaVWBL+52_c=jgs293bgB+Qe-MafKE7dWZRsmx9A@mail.gmail.com>
     [not found]       ` <125AABD0-02D5-4656-9F55-4B5BFBD5BD3D@nvidia.com>
2026-05-12  9:02         ` Baolin Wang
     [not found] ` <20260421-swap-table-p4-v3-6-2f23759a76bc@tencent.com>
2026-05-06 20:57   ` [PATCH v3 06/12] mm/memcg, swap: tidy up cgroup v1 memsw swap helpers Chris Li
     [not found] ` <20260421-swap-table-p4-v3-7-2f23759a76bc@tencent.com>
2026-05-08  4:01   ` [PATCH v3 07/12] mm, swap: support flexible batch freeing of slots in different memcgs Chris Li
     [not found] ` <20260421-swap-table-p4-v3-8-2f23759a76bc@tencent.com>
2026-05-08  4:46   ` [PATCH v3 08/12] mm, swap: delay and unify memcg lookup and charging for swapin Chris Li
     [not found] ` <20260421-swap-table-p4-v3-9-2f23759a76bc@tencent.com>
2026-05-08  5:02   ` [PATCH v3 09/12] mm, swap: consolidate cluster allocation helpers Chris Li
     [not found] ` <20260421-swap-table-p4-v3-10-2f23759a76bc@tencent.com>
2026-05-08 22:46   ` [PATCH v3 10/12] mm/memcg, swap: store cgroup id in cluster table directly Chris Li
     [not found] ` <20260421-swap-table-p4-v3-11-2f23759a76bc@tencent.com>
2026-05-08 22:47   ` [PATCH v3 11/12] mm/memcg: remove no longer used swap cgroup array Chris Li
     [not found] ` <20260421-swap-table-p4-v3-5-2f23759a76bc@tencent.com>
2026-05-06 20:48   ` [PATCH v3 05/12] mm, swap: unify large folio allocation Chris Li
2026-05-11 12:57   ` David Hildenbrand (Arm)
2026-05-11 14:37     ` Kairui Song
2026-05-11 15:15       ` David Hildenbrand (Arm)
2026-05-11 16:44         ` Kairui Song
2026-05-12  6:07           ` David Hildenbrand (Arm)
2026-05-12 10:10   ` Baolin Wang [this message]
     [not found] ` <20260421-swap-table-p4-v3-12-2f23759a76bc@tencent.com>
2026-05-11 16:30   ` [PATCH v3 12/12] mm, swap: merge zeromap into swap table Chris Li
2026-05-11 16:34 ` [PATCH v3 00/12] mm, swap: swap table phase IV: unify allocation and reduce static metadata Chris Li
     [not found] ` <CAMgjq7CJ8Are6m7X2UxUoJ=77c_oSpdG8-bzkmdRzwey2Cp1gQ@mail.gmail.com>
2026-05-11 21:12   ` Andrew Morton
2026-05-12  5:10     ` Kairui Song
     [not found] ` <20260421-swap-table-p4-v3-4-2f23759a76bc@tencent.com>
2026-05-06 20:27   ` [PATCH v3 04/12] mm, swap: add support for stable large allocation in swap cache directly Chris Li
2026-05-12  9:48   ` Baolin Wang
2026-05-12  9:55     ` Kairui Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d5341d37-5644-4446-a406-9a7251b83399@linux.alibaba.com \
    --to=baolin.wang@linux.alibaba.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chengming.zhou@linux.dev \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kasong@tencent.com \
    --cc=lance.yang@linux.dev \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=mhocko@kernel.org \
    --cc=mhocko@suse.com \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=surenb@google.com \
    --cc=yosry@kernel.org \
    --cc=youngjun.park@lge.com \
    --cc=zhengqi.arch@bytedance.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox