Linux cgroups development

Linux cgroups development
 help / color / mirror / Atom feed

* Re: [RFC PATCH v2 3/9] mm/zswap: support fully zswap-backed large folio loads
From: Nhat Pham @ 2026-05-29 18:25 UTC (permalink / raw)
  To: fujunjie
  Cc: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif,
	Chris Li, Johannes Weiner, Yosry Ahmed, David Hildenbrand,
	Hugh Dickins, Roman Gushchin, Shakeel Butt, linux-kernel, cgroups
In-Reply-To: <tencent_7D186EDC2C9AB9009F9915C1E68F3CF44609@qq.com>

On Fri, May 29, 2026 at 5:19 AM fujunjie <fujunjie1@qq.com> wrote:
>
> zswap currently refuses large swapcache folios. That is correct for mixed
> backend ranges, but it also prevents the common swapin path from loading a
> range that is still fully backed by zswap.
>
> Teach zswap_load() to fill a locked large swapcache folio by decompressing
> each base-page entry into the matching folio offset, then flushing the
> folio once. A missing entry after zswap data has been seen is reported as
> -EAGAIN so the caller can drop the speculative large folio and retry
> order-0.
>
> The large load keeps the zswap entries in place. It is a clean speculative
> fill: until the swap slots are freed, zswap remains the backing copy if
> reclaim drops the large folio before PTEs are installed.
>
> Signed-off-by: fujunjie <fujunjie1@qq.com>
> ---
>  mm/zswap.c | 105 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 87 insertions(+), 18 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index da5297f7bd69..94ba112a2982 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -15,6 +15,8 @@
>
>  #include <linux/module.h>
>  #include <linux/cpu.h>
> +#include <linux/mm.h>
> +#include <linux/huge_mm.h>
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
> @@ -934,7 +936,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>         return comp_ret == 0 && alloc_ret == 0;
>  }
>
> -static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio,
> +                            unsigned int page_idx, bool flush_dcache)
>  {
>         struct zswap_pool *pool = entry->pool;
>         struct scatterlist input[2]; /* zsmalloc returns an SG list 1-2 entries */
> @@ -952,14 +955,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
>                 WARN_ON_ONCE(input->length != PAGE_SIZE);
>
> -               dst = kmap_local_folio(folio, 0);
> +               dst = kmap_local_folio(folio, page_idx * PAGE_SIZE);
>                 memcpy_from_sglist(dst, input, 0, PAGE_SIZE);
>                 dlen = PAGE_SIZE;
>                 kunmap_local(dst);
> -               flush_dcache_folio(folio);
> +               if (flush_dcache)
> +                       flush_dcache_folio(folio);
>         } else {
>                 sg_init_table(&output, 1);
> -               sg_set_folio(&output, folio, PAGE_SIZE, 0);
> +               sg_set_folio(&output, folio, PAGE_SIZE, page_idx * PAGE_SIZE);
>                 acomp_request_set_params(acomp_ctx->req, input, &output,
>                                          entry->length, PAGE_SIZE);
>                 ret = crypto_acomp_decompress(acomp_ctx->req);
> @@ -1042,7 +1046,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>                 goto out;
>         }
>
> -       if (!zswap_decompress(entry, folio)) {
> +       if (!zswap_decompress(entry, folio, 0, true)) {
>                 ret = -EIO;
>                 goto out;
>         }
> @@ -1615,10 +1619,9 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
>   *  NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
>   *  will SIGBUS).
>   *
> - *  -EINVAL: if the swapped out content was in zswap, but the page belongs
> - *  to a large folio, which is not supported by zswap. The folio is unlocked,
> - *  but NOT marked up-to-date, so that an IO error is emitted (e.g.
> - *  do_swap_page() will SIGBUS).
> + *  -EAGAIN: if the swapped out content belongs to a large folio, but the
> + *  range is mixed or raced with writeback. The folio remains locked so the
> + *  caller can drop the large swapcache folio and retry order-0.
>   *
>   *  -ENOENT: if the swapped out content was not in zswap. The folio remains
>   *  locked on return.
> @@ -1626,9 +1629,12 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
>  int zswap_load(struct folio *folio)
>  {
>         swp_entry_t swp = folio->swap;
> +       unsigned int nr_pages = folio_nr_pages(folio);
> +       unsigned int type = swp_type(swp);
>         pgoff_t offset = swp_offset(swp);
> -       struct xarray *tree = swap_zswap_tree(swp);
> +       struct xarray *tree;
>         struct zswap_entry *entry;
> +       unsigned int i;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1636,21 +1642,84 @@ int zswap_load(struct folio *folio)
>         if (zswap_never_enabled())
>                 return -ENOENT;
>
> -       /*
> -        * Large folios should not be swapped in while zswap is being used, as
> -        * they are not properly handled. Zswap does not properly load large
> -        * folios, and a large folio may only be partially in zswap.
> -        */
> -       if (WARN_ON_ONCE(folio_test_large(folio))) {
> +       if (folio_test_large(folio)) {
> +               struct obj_cgroup *first_objcg = NULL;
> +               bool same_objcg = true;
> +               bool saw_zswap = false;
> +               bool saw_non_zswap = false;
> +
> +               /*
> +                * The locked large swapcache folio now covers the range and
> +                * conflicts with zswap writeback's order-0 swapcache allocation.
> +                * If the range is mixed or an entry disappears, retry order-0.
> +                */
> +               for (i = 0; i < nr_pages; i++) {
> +                       tree = swap_zswap_tree(swp_entry(type, offset + i));
> +                       entry = xa_load(tree, offset + i);
> +                       if (!entry) {
> +                               if (saw_zswap)
> +                                       return -EAGAIN;
> +                               saw_non_zswap = true;
> +                               continue;
> +                       }

Can we use xas_load API here instead of traversing down the tree again
and again?

> +                       if (saw_non_zswap)
> +                               return -EAGAIN;
> +
> +                       if (!saw_zswap)
> +                               first_objcg = entry->objcg;
> +                       else if (entry->objcg != first_objcg)
> +                               same_objcg = false;

Can we get different objcg at this point?

> +                       saw_zswap = true;
> +               }
> +               if (!saw_zswap)
> +                       return -ENOENT;
> +
> +               for (i = 0; i < nr_pages; i++) {
> +                       tree = swap_zswap_tree(swp_entry(type, offset + i));
> +                       entry = xa_load(tree, offset + i);
> +                       if (!entry)
> +                               return -EAGAIN;
> +
> +                       if (!zswap_decompress(entry, folio, i, false)) {
> +                               folio_unlock(folio);
> +                               return -EIO;
> +                       }
> +               }
> +
> +               flush_dcache_folio(folio);
> +               /*
> +                * Keep zswap entries until swap slots are freed. This is a clean
> +                * speculative fill; zswap remains the backing copy if reclaim
> +                * drops the large folio before PTEs are installed.
> +                */
> +               folio_mark_uptodate(folio);
> +               count_vm_events(ZSWPIN, nr_pages);
> +               count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> +
> +               if (same_objcg) {
> +                       if (first_objcg)
> +                               count_objcg_events(first_objcg, ZSWPIN, nr_pages);
> +               } else {
> +                       for (i = 0; i < nr_pages; i++) {
> +                               tree = swap_zswap_tree(swp_entry(type, offset + i));
> +                               entry = xa_load(tree, offset + i);
> +                               if (WARN_ON_ONCE(!entry))
> +                                       continue;
> +                               if (entry->objcg)
> +                                       count_objcg_events(entry->objcg, ZSWPIN, 1);

xas_load() here too?


> +                       }
> +               }
> +
>                 folio_unlock(folio);
> -               return -EINVAL;
> +               return 0;
>         }

>
> +       tree = swap_zswap_tree(swp);
>         entry = xa_load(tree, offset);
>         if (!entry)
>                 return -ENOENT;
>
> -       if (!zswap_decompress(entry, folio)) {
> +       if (!zswap_decompress(entry, folio, 0, true)) {
>                 folio_unlock(folio);
>                 return -EIO;
>         }

I wonder how much of these two paths (order 0 and larger order) can be
unified...

> --
> 2.34.1
>

^ permalink raw reply

* Re: [PATCH] cgroup/cpuset: Free sched domains on rebuild guard failure
From: Waiman Long @ 2026-05-29 18:07 UTC (permalink / raw)
  To: Guopeng Zhang, Tejun Heo, Johannes Weiner, Michal Koutný
  Cc: Chen Ridong, cgroups, linux-kernel, Guopeng Zhang
In-Reply-To: <20260528093742.1792456-1-guopeng.zhang@linux.dev>

On 5/28/26 5:37 AM, Guopeng Zhang wrote:
> From: Guopeng Zhang <zhangguopeng@kylinos.cn>
>
> generate_sched_domains() returns sched-domain masks and optional
> attributes that are normally handed to partition_sched_domains(), which
> takes ownership of them.
>
> rebuild_sched_domains_locked() has a WARN guard after
> generate_sched_domains() and before partition_sched_domains() to avoid
> passing offline CPUs into the scheduler domain rebuild path. If that
> guard fires, the function currently returns directly without freeing
> the generated doms and attr.
>
> Free the generated sched-domain masks and attributes before returning
> from the guard failure path.
>
> Signed-off-by: Guopeng Zhang <zhangguopeng@kylinos.cn>
> ---
>   kernel/cgroup/cpuset.c | 5 ++++-
>   1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 51327333980a..c5fdebc205d8 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1004,8 +1004,11 @@ void rebuild_sched_domains_locked(void)
>   	* prevent the panic.
>   	*/
>   	for (i = 0; doms && i < ndoms; i++) {
> -		if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
> +		if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) {
> +			free_sched_domains(doms, ndoms);
> +			kfree(attr);
>   			return;
> +		}
>   	}
>   
>   	/* Have scheduler rebuild the domains */
This WARN_ON_ONCE() is mainly used to catch bugs during code updates. It
shouldn't be triggered in normal use. Anyway, it is a nice-to-have fix.

Reviewed-by:  Waiman Long <longman@redhat.com>





^ permalink raw reply

* Re: [RFC PATCH v2 0/9] mm: support zswap-backed large folio swapin
From: Nhat Pham @ 2026-05-29 18:06 UTC (permalink / raw)
  To: fujunjie
  Cc: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif,
	Chris Li, Johannes Weiner, Yosry Ahmed, David Hildenbrand,
	Hugh Dickins, Roman Gushchin, Shakeel Butt, linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

On Fri, May 29, 2026 at 5:17 AM fujunjie <fujunjie1@qq.com> wrote:
>
> Hi,
>
> This RFC explores large-folio swapin for ranges that are still fully backed
> by zswap.
>
> Large swapin is currently disabled once zswap is in the picture. Anonymous
> faults stop considering large orders after zswap has ever been enabled,
> shmem does the same, and zswap_load() refuses large swapcache folios. That
> keeps mixed zswap/disk cases safe, but it also loses the dense case where
> every slot in an aligned 64K range is still resident in zswap.
>
> The series keeps the policy in common swapin code:
>
>   - zswap reports backend facts and provides the large-folio load helper.
>   - swapin_sync() filters candidate orders by backend range.
>   - all-disk and zeromap ranges keep the existing Kairui large-swapin path.
>   - mixed zswap/disk ranges stay order-0.
>   - all-zswap ranges may use a 64K folio after locality admission.
>   - anon provides locality evidence from VMA hints and PTE young density.
>   - shmem starts with explicit VMA-hint evidence only.
>   - swap readahead uses its existing VMA/cluster window as locality
>     evidence; it does not also run the anon PTE-young rule.
>
> The backend range probe is only a snapshot. If the backend changes after a
> fresh large swapcache folio is allocated, the common path drops that folio
> and falls back to order-0. zswap_load() can also return -EAGAIN for the
> same retry path. If a late fault retry keeps the large folio in swapcache
> instead of deleting it, the cgroup v1 memsw swap owner is committed before
> returning.
>
> This is mTHP/large-folio swapin. The mappings installed by do_swap_page()
> are still PTE mappings, not PMD mappings. The expected win is fewer faults,
> batched PTE/rmap work, and preserving the large folio across zswapin
> instead of rebuilding the working set as order-0 pages.
>
> Prior art: Usama Arif posted a related RFC on 2024-10-18:
>
>   mm: zswap: add support for zswapin of large folios
>   https://lore.kernel.org/linux-mm/20241018105026.2521366-1-usamaarif642@gmail.com/
>
> This RFC keeps the same broad goal, but moves admission into common swapin
> code. zswap does not decide the policy. Mixed zswap/disk ranges are
> rejected before large IO, and the first cap is 64K.
>
> This is a rewrite of the RFC posted on 2026-05-08:
>
>   [RFC PATCH 0/5] mm: support zswap-backed anonymous large folio swapin
>   https://lore.kernel.org/linux-mm/tencent_8B437BE4F586C162950BF71954316C1EDB05@qq.com/
>
> The v1 series was anonymous-only and kept too much of the policy near the
> anon fault and zswap paths. This version is rebuilt on top of Kairui Song's
> common swapin infrastructure. It keeps admission in common swapin code,
> rejects mixed zswap/disk large ranges, and adds separate locality producers
> for anon, shmem and swap readahead.
>
> Performance and behavior
> ========================
>
> The A/B tables are 10-run measurements. Elapsed values are seconds,
> shown as mean +/- sample standard deviation. "phase" or "refault" is the
> measured refault subphase. "zswpin" counts zswap loads. "pswpin" counts
> swap-ins from the real swap device; pswpin=0 means the refaults were served
> by zswap even when a disk swap device was configured. "RFC 64K" is the mean
> number of successful 64K swapins.
>
> The numbers below show where the large path is used and where it is
> rejected.
>
> zram-backed zswap microbench, 64K mTHP, 8G guest:
>
> +-----------------+----------------+----------------+--------+--------+--------+----------+
> | workload        | base elapsed   | RFC elapsed    | delta  | phase  | zswpin | RFC 64K  |
> +-----------------+----------------+----------------+--------+--------+--------+----------+
> | usama_1g        | 11.260+/-0.235 | 10.301+/-0.140 | -8.5%  | -22.2% | 1.000x | 16381.1  |
> | nohint_seq64    |  4.398+/-0.085 |  4.025+/-0.022 | -8.5%  | -21.1% | 1.000x |  6221.1  |
> | seqhint_seq64   |  4.283+/-0.060 |  3.948+/-0.062 | -7.8%  | -20.6% | 1.000x |  6223.5  |
> | stride64_sparse |  3.095+/-0.051 |  3.086+/-0.025 | -0.3%  |  +5.8% | 1.002x |     1.0  |
> | random64_sparse |  3.095+/-0.046 |  3.076+/-0.016 | -0.6%  |  +0.7% | 1.001x |     0.0  |
> | random64_full   |  4.423+/-0.067 |  4.405+/-0.018 | -0.4%  |  +0.1% | 1.000x |     0.0  |
> +-----------------+----------------+----------------+--------+--------+--------+----------+
>
> The usama_1g row follows the shape of the 2024 RFC benchmark: allocate 1G,
> fill it with compressible per-page data, reclaim it through memory.reclaim,
> then time the full integrity-check refault. The seq64 rows use a 512M
> target and 768M of pressure. "sparse" touches one 4K page per 64K region, while
> "full" touches every 4K page. "seqhint" uses MADV_SEQUENTIAL; "nohint" does
> not.
>
> Virtio-block swap device present, zswap enabled:
>
> +-----------------+---------------+---------------+--------+---------+--------+--------+---------+
> | workload        | base elapsed  | RFC elapsed   | delta  | refault | pswpin | zswpin | RFC 64K |
> +-----------------+---------------+---------------+--------+---------+--------+--------+---------+
> | seq64           | 4.399+/-0.100 | 4.279+/-0.216 | -2.7%  | -10.5%  | 0      | 1.000x | 3110.7  |
> | stride64_sparse | 3.103+/-0.047 | 3.119+/-0.086 | +0.5%  |  +6.2%  | 0      | 0.999x |    0.0  |
> | random64_sparse | 3.142+/-0.112 | 3.097+/-0.030 | -1.4%  |  -2.2%  | 0      | 0.999x |    0.1  |
> | random64_full   | 4.473+/-0.147 | 4.445+/-0.088 | -0.6%  |  +0.9%  | 0      | 1.000x |    0.4  |
> +-----------------+---------------+---------------+--------+---------+--------+--------+---------+
>
> This run uses a real block swap device, but the refaulted data stayed in
> zswap. It covers the all-zswap hit path with disk swap configured, not disk
> read IO.
>
> Virtio-block pressure/mixed run, zswap max_pool_percent=1,
> low-compressibility full fill:
>
> +-------------------------------+---------------+---------------+--------+---------+----------------+------------+---------+----------+
> | workload                      | base elapsed  | RFC elapsed   | delta  | refault | pswpin base/RFC | RFC zswpin | RFC 64K | fallback |
> +-------------------------------+---------------+---------------+--------+---------+----------------+------------+---------+----------+
> | seq64_full_pressure           | 5.908+/-0.195 | 5.790+/-0.235 | -2.0%  |  +3.0%  | 90258/99038    | 20327      |   0.0   | 3730     |
> | random64_sparse_full_pressure | 5.104+/-0.069 | 5.068+/-0.090 | -0.7%  |  -9.1%  |  6201/6196     |  1297      |   0.0   |    0     |
> +-------------------------------+---------------+---------------+--------+---------+----------------+------------+---------+----------+
>
> This run reaches the disk-backed path: pswpin is non-zero in both base and
> RFC. It is mainly fallback coverage. The RFC does not install 64K folios
> for these disk/mixed-heavy ranges.

OK, these results look good. Basically, if we don't have spatial
locality in access patterns, we don't do THP zswapin. Nice.

>
> Policy matrix, virtio-block swap device present:
>
> +------------------------------+----+------+--------+--------+-------+----------+
> | case                         | pc | hint | pswpin | zswpin | zswpwb| 64K in   |
> +------------------------------+----+------+--------+--------+-------+----------+
> | pc0_seq                      | 0  | none | 0      | 99559  | 0     | 0        |
> | pc3_seq                      | 3  | none | 0      | 99498  | 0     | 0        |
> | pc4_seq                      | 4  | none | 0      | 99512  | 0     | 3109     |
> | pc5_seq                      | 5  | none | 0      | 99657  | 0     | 3113     |
> | hint_none_random_sparse      | 5  | none | 0      |  6265  | 0     | 0        |
> | hint_random_seq              | 5  | rand | 0      | 99488  | 0     | 0        |
> | mixed_seq_full               | 5  | none | 97725  | 20147  | 84    | 569      |
> | mixed_random_sparse_full     | 5  | none |  6230  |  1302  | 0     | 0        |
> +------------------------------+----+------+--------+--------+-------+----------+
>
> The pc rows show the readahead-window gate. The hint_random_seq row shows
> the explicit random hint veto. The mixed rows use a small zswap pool to
> force disk/zswap split backing; most mixed ranges are rejected, while any
> remaining 64K successes were all-zswap at the time of the fault.
>
> Kbuild pressure, zram swap, 384M memcg:
>
> +----------------------+----------+----------+--------+--------+----------+
> | setup                | base     | RFC      | delta  | zswpin | RFC 64K  |
> +----------------------+----------+----------+--------+--------+----------+
> | zram swap, 384M memcg| 2060.323 | 2047.516 | -0.6%  | 0.991x | 2797     |
> +----------------------+----------+----------+--------+--------+----------+
>
> This is a single-run zram pressure smoke. It did not show Kbuild
> regression, and the RFC run installed 64K zswap-backed folios. The result
> should not be read as a tuned-performance claim.
>
> Kbuild pressure, virtio-block swap device, 512M memcg:
>
> +-------------------------+----------+----------+--------+--------+----------+
> | setup                   | base     | RFC      | delta  | pswpin | RFC 64K  |
> +-------------------------+----------+----------+--------+--------+----------+
> | disk swap, 512M memcg   | 1420.671 | 1409.263 | -0.8%  | 0      | 7497     |
> +-------------------------+----------+----------+--------+--------+----------+
>
> This is a single-run pressure smoke. The disk-swap Kbuild run also stayed
> on the all-zswap hit path, so it is pressure coverage with a disk swap device
> present rather than proof of disk-read large swapin.

Why only a single run?

>
> Shmem smoke, tmpfs huge=always, 64K shmem mTHP:
>
> +----------------------------+---------------+---------+-------------+----------+
> | case                       | refault hint  | touched | 64K shmem   | 64K in   |
> +----------------------------+---------------+---------+-------------+----------+
> | nohint_seq                 | none          | 65536   | 4096        | 0        |
> | seq_refault_hint           | sequential    | 65536   | 4096        | 4096     |
> | random_refault_hint_sparse | random        |  4096   | 4096        | 0        |
> +----------------------------+---------------+---------+-------------+----------+
>
> That matches the current shmem producer: explicit sequential refault hints
> allow large zswap swapin; no hint and random hints do not.
>
> What this RFC does not establish
> ================================
>
> The 64K cap is deliberate, but it is not tuned. The anon PTE-young rule is
> only anon evidence. Shmem has the framework and explicit VMA hints in this
> RFC, not a page-cache locality producer. For larger orders, the anon
> producer should probably use bounded sampling instead of walking every PTE
> in a 1M or larger candidate range. The mixed-backend tests cover fallback
> behavior, but this series does not add mixed zswap/disk large IO.

The mixed IO can be deferred, but I think we should figure out a rule
to extend this hint to arbitrarily sized ranges, and preferably shmem
too.

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: Yury Norov @ 2026-05-29 17:47 UTC (permalink / raw)
  To: Joshua Hahn
  Cc: Andrew Morton, David Hildenbrand, Zi Yan, Matthew Brost,
	Rakie Kim, Byungchul Park, Gregory Price, Ying Huang,
	Alistair Popple, linux-mm, linux-kernel, Farhad Alemi,
	Waiman Long, Rasmus Villemoes, cgroups
In-Reply-To: <20260529152616.2308736-1-joshua.hahnjy@gmail.com>

On Fri, May 29, 2026 at 08:26:15AM -0700, Joshua Hahn wrote:
> On Thu, 28 May 2026 12:41:33 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > On Thu, 28 May 2026 15:03:37 -0400 Yury Norov <ynorov@nvidia.com> wrote:
> > 
> > > Reassigning nodes relative to an empty user-provided nodemask is useless,
> > > and triggers divide-by-zero in the function.
> > > 
> > > Reported-by: Farhad Alemi <farhad.alemi@berkeley.edu>
> > > Link: https://lore.kernel.org/all/CA+0ovCgxbZkXa+OU8w3s84R3KNPNxxRfmsNR-udh+afQBbGNmw@mail.gmail.com/
> > 
> > Thanks both.
> > 
> > It looks like this is very old code, so we'll be wanting a cc:stable in
> > this.
> > 
> > > --- a/mm/mempolicy.c
> > > +++ b/mm/mempolicy.c
> > > @@ -370,8 +370,13 @@ static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> > >  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
> > >  				   const nodemask_t *rel)
> > >  {
> > > +	unsigned int w = nodes_weight(*rel);
> > >  	nodemask_t tmp;
> > > -	nodes_fold(tmp, *orig, nodes_weight(*rel));
> > > +
> > > +	if (w == 0)
> > > +		return -EINVAL;
> > > +
> > > +	nodes_fold(tmp, *orig, w);
> > >  	nodes_onto(*ret, tmp, *rel);
> > >  }
> > 
> > I suspect we should address this at the mpol level - it should never
> > have got that far.  Hopefully the mempolicy maintainers can have a
> > think.
> 
> Hello Andrew, hello Yury,
> 
> I agree with Andrew here.
> mpol_relative_nodemask is called from two places, the first being
> mpol_rebind_nodemask which is the calling function seen in the bug report as
> well.
> 
> The other place is mpol_set_nodemask, which has a helpful comment that notes:
> "mpol_set_nodemask is called after mpol_new() [...snip...] mpol_new() has
> already validated the nodes parameter with respect to the policy mode and
> flags".
> 
> So it seems like we are missing the big if-else if-else if block from mpol_new
> in other places that should in fact have it, like mpol_rebind_nodemask.
> 
> The approach proposed here of just checking whether the node weight is 0
> won't work for a few cases, namely for MPOL_DEFAULT and MPOL_PREFERRED where
> empty nodemasks are actually allowed. So what should really be done here is to
> do the full policy-nodemask checking section in mpol_new and call that from
> mpol_set_nodemask as well.
> 
> Thank you for taking a shot at fixing the bug report, please let me know what
> you think! Have a great day : -)

Hi Joshua.

Indeed, quick and dirty shot.

The problem is that nodes_fold() can't work with sz == 0. In
other words, folding to a 0-bit bitmap is an error. We don't check
that at the bitmap level because it's an internal helper, and it's the
caller's responsibility to validate the parameters.

nodes_onto(), or more specifically bitmap_onto(), is a different
story. In case of empty relmap, the function actually clears all the
bits in dst and returns.

I see 2 options to move this forward.

1. Simply disallow an empty relmap in mpol_relative_nodemask(). There are
no valid cases for it, AFAIK, so the nodes_fold() limitation looks
reasonable. We can consider it as a new policy.

We've got 2 users for mpol_relative_nodemask(). In mpol_set_nodemask()
we can simply propagate the error; and in mpol_rebind_nodemask() we
can throw a warning and do nothing.

2. Follow the spirit of nodes_onto(), and in case of an empty
relmask, clear the ret mask and bail out.

I'm in favor of the 1st option, because an empty relmask looks buggy
anyway.

> The approach proposed here of just checking whether the node weight is 0
> won't work for a few cases, namely for MPOL_DEFAULT and MPOL_PREFERRED where
> empty nodemasks are actually allowed.

Not sure I understand this. The mpol_relative_nodemask() is called
only if MPOL_F_RELATIVE_NODES is set. In mpol_rebind_nodemask(), if
both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the former
wins. How would the RELATIVE mode mess with the others?

The mpol_new() code seemingly tries to disallow empty nodemasks in case of
MPOL_DEFAULT and MPOL_PREFERRED + MPOL_F_RELATIVE_NODES, but obviously
it doesn't work very well in the rebind case.

Anyways, I'm not really deep in mempolicy domain, so please educate me if
I miss something.

Thanks,
Yury

^ permalink raw reply

* Re: [PATCH v5 9/9] mm: switch deferred split shrinker to list_lru
From: Kairui Song @ 2026-05-29 17:33 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes, Shakeel Butt,
	Michal Hocko, Dave Chinner, Roman Gushchin, Muchun Song, Qi Zheng,
	Yosry Ahmed, Zi Yan, Liam R . Howlett, Usama Arif,
	Kiryl Shutsemau, Vlastimil Babka, Mikhail Zaslonko, Vasily Gorbik,
	Baolin Wang, Barry Song, Dev Jain, Lance Yang, Nico Pache,
	Ryan Roberts, cgroups, linux-mm, linux-kernel
In-Reply-To: <20260527204757.2544958-10-hannes@cmpxchg.org>

On Thu, May 28, 2026 at 4:48 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
>         alloc/unmap
>           deferred_split_folio()
>             list_add_tail(memcg->split_queue)
>             set_shrinker_bit(memcg, node, deferred_shrinker_id)
>
>         for_each_zone_zonelist_nodemask(restricted_nodes)
>           mem_cgroup_iter()
>             shrink_slab(node, memcg)
>               shrink_slab_memcg(node, memcg)
>                 if test_shrinker_bit(memcg, node, deferred_shrinker_id)
>                   deferred_split_scan()
>                     walks memcg->split_queue
>
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_alloc_deferred(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
>
> Reported-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> Tested-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/huge_mm.h    |   7 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 364 +++++++++++++------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   5 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |   4 +
>  mm/mm_init.c               |  15 --
>  mm/swap_state.c            |  10 +
>  10 files changed, 150 insertions(+), 285 deletions(-)

...

> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 04f5ce992401..9c3a5cf99778 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -465,6 +465,16 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>                 return ERR_PTR(-ENOMEM);
>         }
>
> +       if (order > 1 && folio_memcg_alloc_deferred(folio)) {
> +               spin_lock(&ci->lock);
> +               __swap_cache_do_del_folio(ci, folio, entry, shadow);
> +               spin_unlock(&ci->lock);
> +               folio_unlock(folio);
> +               /* nr_pages refs from swap cache, 1 from allocation */
> +               folio_put_refs(folio, nr_pages + 1);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +

Thanks!

Nit: I think it would be better if we move the error handling under a
label to be shared with the charge failure above, but it's fine this
way, can be simplified later.

Reviewed-by: Kairui Song <kasong@tencent.com>

^ permalink raw reply

* Re: [PATCH 5/5] cgroup: Defer kill_css_finish() in cgroup_apply_control_disable()
From: Tejun Heo @ 2026-05-29 17:25 UTC (permalink / raw)
  To: Mark Brown
  Cc: Johannes Weiner, Michal Koutný, Sebastian Andrzej Siewior,
	Petr Malat, Bert Karwatzki, kernel test robot, Martin Pitt,
	cgroups, linux-kernel, Aishwarya.TCV
In-Reply-To: <41cd159c-54e5-45e0-81df-eaf36a6c028e@sirena.org.uk>

Hello, Mark.

On Wed, May 27, 2026 at 11:45:54AM +0100, Mark Brown wrote:
> On Mon, May 04, 2026 at 02:51:21PM -1000, Tejun Heo wrote:
> 
> > Same race shape as the rmdir path that 93618edf7538 ("cgroup: Defer css
> > percpu_ref kill on rmdir until cgroup is depopulated") fixed: a task past
> > exit_signals() whose cset subsys[ssid] still pins the disabled controller's
> > css can be touching subsys state while ->css_offline() runs. The earlier
> > patches in this series built up the per-subsys-css deferral machinery and
> > routed cgroup_destroy_locked() through it. Apply the same shape to
> > cgroup_apply_control_disable():
> 
> We've been seeing hangs in our testing of -next on
> multiple arm64 platforms when running LTP test jobs which bisect to this
> patch, which is 1dffd95575eb05bc7e in -next.  It looks like we hit a
> deadlock running stress tests, the end of a typical log looks like this:
> 
> <12>[  181.849144] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_3_3_none: end (returncode: 0)
> <12>[  181.860375] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_3_3_one: start (command: cgroup_fj_stress.sh blkio 3 3 one)
> cgroup_fj_stress_blkio_3_3_one: pass  (1.166s)
> <12>[  183.053379] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_3_3_one: end (returncode: 0)
> <12>[  183.064884] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_4_4_each: start (command: cgroup_fj_stress.sh blkio 4 4 each)
> cgroup_fj_stress_blkio_4_4_each: pass  (8.183s)
> <12>[  191.275815] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_4_4_each: end (returncode: 0)
> <12>[  191.287614] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_4_4_none: start (command: cgroup_fj_stress.sh blkio 4 4 none)
> cgroup_fj_stress_blkio_4_4_none: pass  (3.570s)
> <12>[  194.884173] /opt/ltp/kirk[558]: cgroup_fj_stress_blkio_4_4_none: end (returncode: 0)
> <12>[  194.895255] /opt/ltp/kirk[558]: cgroup_fj_stress_cpu_1_200_each: start (command: cgroup_fj_stress.sh cpu 1 200 each)
> 
> with no further output and given that this is a cgroup locking change
> this does seem like a plausible commit, though I didn't look into it in
> detail.  Bisect log and the list of LTP tests we're running in our test
> job below.  We are running multiple tests in parallel.

Unfortunately, I can't reproduce this in my environment. Any chance you can
try testing on x86 too and see whether it reproduces there?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH-next v3 3/5] cgroup/cpuset: Made cpuset_attach_old_cs track task group leaders
From: Waiman Long @ 2026-05-29 16:54 UTC (permalink / raw)
  To: Guopeng Zhang, Chen Ridong, Tejun Heo, Johannes Weiner,
	Michal Koutný, Ingo Molnar, Peter Zijlstra
  Cc: cgroups, linux-kernel, Aaron Tomlin, Ridong Chen
In-Reply-To: <a3f84c49-96cd-4da3-838e-11c72990bc06@linux.dev>

On 5/28/26 10:19 PM, Guopeng Zhang wrote:
>
> 在 2026/5/27 23:37, Waiman Long 写道:
>> There are two possible ways that migration of tasks from multiple source
>> cpusets to a target cpuset can happen. Either a multithread application
>> with threads in different cpusets is wholly moved to a new cpuset
>> or disabling of v2 cpuset controller will move all the tasks in child
>> cpusets to the parent cpuset.
>>
>> In the former case, it is the mm setting of the group leader that really
>> matters. So cpuset_attach_old_cs should track the oldcs of the thread
>> leader. In the latter case, effective_mems of child cpusets must always
>> be a subset of the parent. So no real page migration will be necessary
>> no matter which child cpuset is selected as cpuset_attach_old_cs.
>>
>> IOW, cpuset_attach_old_cs should be updated to match the latest task
>> group leader in cpuset_can_attach().
>>
>> Suggested-by: Ridong Chen <ridong.chen@linux.dev>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>   kernel/cgroup/cpuset.c | 18 ++++++++++++++++++
>>   1 file changed, 18 insertions(+)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 4457c4f11fce..b233a71f9b7c 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -2967,6 +2967,20 @@ static int update_prstate(struct cpuset *cs, int new_prs)
>>   /*
>>    * cpuset_can_attach() and cpuset_attach() specific internal data
>>    * Protected by cpuset_mutex
>> + *
>> + * The cpuset_attach_old_cs is used mainly by cpuset_migrate_mm() tp get the
>> + * old_mems_allowed value. There are two ways that many-to-one cpuset migration
>> + * can happen:
> Hi Waiman,
>
> I applied this series locally and ran some of my test cases. I didn't
> observe any issue so far.
>
> While doing a static/checkpatch pass, I noticed a few minor issues in
> patches 3, 4 and 5. They are all non-functional nits.
>
> For this patch, I only noticed a couple of small wording/typo nits in
> the new comment:
>
> s/tp get/to get/

Thanks for the review, will fix the typo in the next version.

Cheers,
Longman


^ permalink raw reply

* Re: [PATCH rdma-next v2 3/3] cgroup/rdma: update cgroup resource list for MR_MEM
From: kernel test robot @ 2026-05-29 16:18 UTC (permalink / raw)
  To: Tao Cui, tj, hannes, mkoutny, leon, jgg
  Cc: oe-kbuild-all, linux-rdma, cgroups, Tao Cui
In-Reply-To: <20260529090733.2242822-4-cui.tao@linux.dev>

Hi Tao,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tj-cgroup/for-next]
[also build test WARNING on next-20260528]
[cannot apply to linus/master v7.1-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Tao-Cui/cgroup-rdma-extend-charge-uncharge-API-with-s64-amount-parameter/20260529-171623
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-next
patch link:    https://lore.kernel.org/r/20260529090733.2242822-4-cui.tao%40linux.dev
patch subject: [PATCH rdma-next v2 3/3] cgroup/rdma: update cgroup resource list for MR_MEM
config: i386-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260529/202605291816.15AyhoZE-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605291816.15AyhoZE-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605291816.15AyhoZE-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Warning: kernel/cgroup/rdma.c:210 function parameter 'amount' not described in 'uncharge_cg_locked'
>> Warning: kernel/cgroup/rdma.c:312 function parameter 'amount' not described in 'rdmacg_uncharge_hierarchy'
>> Warning: kernel/cgroup/rdma.c:335 function parameter 'amount' not described in 'rdmacg_uncharge'
>> Warning: kernel/cgroup/rdma.c:210 function parameter 'amount' not described in 'uncharge_cg_locked'
>> Warning: kernel/cgroup/rdma.c:312 function parameter 'amount' not described in 'rdmacg_uncharge_hierarchy'
>> Warning: kernel/cgroup/rdma.c:335 function parameter 'amount' not described in 'rdmacg_uncharge'

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: Joshua Hahn @ 2026-05-29 15:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Yury Norov, David Hildenbrand, Zi Yan, Matthew Brost, Rakie Kim,
	Byungchul Park, Gregory Price, Ying Huang, Alistair Popple,
	linux-mm, linux-kernel, Farhad Alemi, Waiman Long,
	Rasmus Villemoes, cgroups
In-Reply-To: <20260528124133.c88c27b11a8ea0ef05e494f7@linux-foundation.org>

On Thu, 28 May 2026 12:41:33 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu, 28 May 2026 15:03:37 -0400 Yury Norov <ynorov@nvidia.com> wrote:
> 
> > > Reassigning nodes relative to an empty user-provided nodemask is useless,
> > and triggers divide-by-zero in the function.
> > 
> > Reported-by: Farhad Alemi <farhad.alemi@berkeley.edu>
> > Link: https://lore.kernel.org/all/CA+0ovCgxbZkXa+OU8w3s84R3KNPNxxRfmsNR-udh+afQBbGNmw@mail.gmail.com/
> 
> Thanks both.
> 
> It looks like this is very old code, so we'll be wanting a cc:stable in
> this.
> 
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -370,8 +370,13 @@ static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> >  static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
> >  				   const nodemask_t *rel)
> >  {
> > +	unsigned int w = nodes_weight(*rel);
> >  	nodemask_t tmp;
> > -	nodes_fold(tmp, *orig, nodes_weight(*rel));
> > +
> > +	if (w == 0)
> > +		return -EINVAL;
> > +
> > +	nodes_fold(tmp, *orig, w);
> >  	nodes_onto(*ret, tmp, *rel);
> >  }
> 
> I suspect we should address this at the mpol level - it should never
> have got that far.  Hopefully the mempolicy maintainers can have a
> think.

Hello Andrew, hello Yury,

I agree with Andrew here.
mpol_relative_nodemask is called from two places, the first being
mpol_rebind_nodemask which is the calling function seen in the bug report as
well.

The other place is mpol_set_nodemask, which has a helpful comment that notes:
"mpol_set_nodemask is called after mpol_new() [...snip...] mpol_new() has
already validated the nodes parameter with respect to the policy mode and
flags".

So it seems like we are missing the big if-else if-else if block from mpol_new
in other places that should in fact have it, like mpol_rebind_nodemask.

The approach proposed here of just checking whether the node weight is 0
won't work for a few cases, namely for MPOL_DEFAULT and MPOL_PREFERRED where
empty nodemasks are actually allowed. So what should really be done here is to
do the full policy-nodemask checking section in mpol_new and call that from
mpol_set_nodemask as well.

Thank you for taking a shot at fixing the bug report, please let me know what
you think! Have a great day : -)
Joshua

^ permalink raw reply

* Re: [PATCH v2 1/2] mm/memcontrol: add dmem charge/uncharge functions
From: Michal Koutný @ 2026-05-29 14:56 UTC (permalink / raw)
  To: Eric Chanudet
  Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt,
	Muchun Song, Andrew Morton, Maarten Lankhorst, Maxime Ripard,
	Natalie Vock, Tejun Heo, Jonathan Corbet, Shuah Khan, cgroups,
	linux-mm, linux-kernel, dri-devel, T.J. Mercier,
	Christian König, Maxime Ripard, Albert Esteve, Dave Airlie,
	linux-doc
In-Reply-To: <20260519-cgroup-dmem-memcg-double-charge-v2-1-db4d1407062b@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 838 bytes --]

On Tue, May 19, 2026 at 11:59:01AM -0400, Eric Chanudet <echanude@redhat.com> wrote:
> +/**
> + * mem_cgroup_dmem_uncharge - uncharge memcg from a dmem pool allocation
> + * @cgrp: cgroup of the dmem pool
> + * @nr_pages: number of pages to uncharge
> + */
> +void mem_cgroup_dmem_uncharge(struct cgroup *cgrp, unsigned int nr_pages)
> +{
> +	struct cgroup_subsys_state *mem_css;
> +	struct mem_cgroup *memcg;
> +
> +	/* CGROUP_DMEM and MEMCG guarantees this cannot be NULL. */
> +	mem_css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
> +
> +	memcg = mem_cgroup_from_css(mem_css);
> +	if (!memcg || mem_cgroup_is_root(memcg)) {
> +		css_put(mem_css);
> +		return;
> +	}
> +
> +	mod_memcg_state(memcg, MEMCG_DMEM, -nr_pages);
> +	refill_stock(memcg, nr_pages);

This doesn't look right.
Here should be memcg_uncharge().

Regards,
Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply

* Re: [PATCH v2 1/2] mm/memcontrol: add dmem charge/uncharge functions
From: Michal Koutný @ 2026-05-29 14:52 UTC (permalink / raw)
  To: Eric Chanudet
  Cc: Shakeel Butt, Johannes Weiner, Michal Hocko, Roman Gushchin,
	Muchun Song, Andrew Morton, Maarten Lankhorst, Maxime Ripard,
	Natalie Vock, Tejun Heo, Jonathan Corbet, Shuah Khan, cgroups,
	linux-mm, linux-kernel, dri-devel, T.J. Mercier,
	Christian König, Maxime Ripard, Albert Esteve, Dave Airlie,
	linux-doc
In-Reply-To: <ahWfypvuTVsB-pHQ@x1nano>

[-- Attachment #1: Type: text/plain, Size: 1201 bytes --]

On Wed, May 27, 2026 at 03:10:47PM -0400, Eric Chanudet <echanude@redhat.com> wrote:
> but that made me realize there is a catch with
> this patch set, with something like:
> A: +memory{max:32M}/+dmem
> A/B: +memory{max:16M}
> 
> It gets the CSS from the dmem's cgroup with
>   cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
>   mem_cgroup_from_css(mem_css);
> 
> Which would resolve to A's memcg and not enforce the memory.max limit
> set in B when dmem.memcg is set for that region.

One perspective is that this is in accordance with dmem's limit granularity.
If the user wanted to distinguish dmem charges below A, they need to
enable the controller there too. IOW, the depends_on in one direction is
correct. dmem is primary when it comes to those charges and memcg
secondary.

Another possibility would be to always use the highest precision
available (wrt where current resides) and then the API should refer to
struct cgroup from task_dfl_cgroup(current) (and make this only
available on v2), or to struct css_set and extract respective subsys
csses in the double charging function.

In either case, it's worth mentioning the behavior in the dmem docs.

HTH,
Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply

* Re: [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()
From: Kairui Song @ 2026-05-29 14:45 UTC (permalink / raw)
  To: fujunjie
  Cc: Andrew Morton, linux-mm, Alexandre Ghiti, Usama Arif, Chris Li,
	Johannes Weiner, Yosry Ahmed, Nhat Pham, David Hildenbrand,
	Hugh Dickins, Roman Gushchin, Shakeel Butt, linux-kernel, cgroups
In-Reply-To: <CAMgjq7AA_1esgtA8VyxaBLWBBRM12bCBpxO2Jch5OESBZSg--A@mail.gmail.com>

On Fri, May 29, 2026 at 10:43 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> Hi Fujunjie,
>
> Thanks for the update, but this whole defer_memcg1_swapin thing is so
> ugly I don't think this is the right way at all.
>
> If you really need this, maybe you can always defer the memcg1

Oh and I'm not saying I'm against this series or the idea, I'm just
saying this particular design of this one patch needs some improvement
:)

^ permalink raw reply

* Re: [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()
From: Kairui Song @ 2026-05-29 14:43 UTC (permalink / raw)
  To: fujunjie
  Cc: Andrew Morton, linux-mm, Alexandre Ghiti, Usama Arif, Chris Li,
	Johannes Weiner, Yosry Ahmed, Nhat Pham, David Hildenbrand,
	Hugh Dickins, Roman Gushchin, Shakeel Butt, linux-kernel, cgroups
In-Reply-To: <tencent_EB78848E34DC7858C873193D67286ECD4B0A@qq.com>

On Fri, May 29, 2026 at 8:26 PM fujunjie <fujunjie1@qq.com> wrote:
>
> A large swapin can only read one folio when the whole range has compatible
> backing. Mixed zswap/disk ranges must not reach large-folio IO, and zswap
> range probes are only snapshots.
>
> Filter the orders passed to swap_cache_alloc_folio() in swapin_sync().
> Uniform zeromap ranges and all-disk ranges keep the existing large swapin
> path. Fully zswap-backed ranges may be tried. Mixed zswap/disk ranges fall
> back before allocation.
>
> After a large swapcache folio is installed, recheck the zswap range and
> drop the fresh folio if it became mixed. Also consume -EAGAIN from
> swap_read_folio() the same way. Both cases retry order-0, where each slot
> can resolve its current backend independently.
>
> Signed-off-by: fujunjie <fujunjie1@qq.com>
> ---
>  mm/memcontrol-v1.c |   8 ++-
>  mm/memory.c        |  31 ++++++++-
>  mm/swap_state.c    | 169 ++++++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 194 insertions(+), 14 deletions(-)
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 765069211567..5b11b8055c66 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -682,8 +682,8 @@ void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
>   * memcg1_swapin - uncharge swap slot on swapin
>   * @folio: folio being swapped in
>   *
> - * Call this function after successfully adding the charged
> - * folio to swapcache.
> + * Call this after the charged folio has been added to swapcache and the caller
> + * is no longer going to drop it back to swapped-out state.
>   *
>   * Context: The folio has to be in swap cache and locked.
>   */
> @@ -721,7 +721,9 @@ void memcg1_swapin(struct folio *folio)
>         id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
>                                  nr_pages);
>         swap_cluster_unlock(ci);
> -       mem_cgroup_uncharge_swap(id, nr_pages);
> +
> +       if (id)
> +               mem_cgroup_uncharge_swap(id, nr_pages);
>  }
>  #endif
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5a365492a9a2..d73a19692dea 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4538,6 +4538,24 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
>                 folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
>  }
>
> +static void memcg1_swapin_retry_folio(struct folio *folio,
> +                                     struct vm_fault *vmf)
> +{
> +       if (!folio_test_large(folio) || !folio_test_swapcache(folio))
> +               return;
> +
> +       if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> +               if (!folio_trylock(folio))
> +                       return;
> +       } else {
> +               folio_lock(folio);
> +       }
> +
> +       if (folio_test_large(folio) && folio_test_swapcache(folio))
> +               memcg1_swapin(folio);
> +       folio_unlock(folio);
> +}
> +
>  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
>  {
>         vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> @@ -4857,8 +4875,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>
>         swapcache = folio;
>         ret |= folio_lock_or_retry(folio, vmf);
> -       if (ret & VM_FAULT_RETRY)
> +       if (ret & VM_FAULT_RETRY) {
> +               memcg1_swapin_retry_folio(folio, vmf);
>                 goto out_release;
> +       }
>
>         page = folio_file_page(folio, swp_offset(entry));
>         /*
> @@ -5067,6 +5087,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         if (unlikely(folio != swapcache)) {
>                 folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
>                 folio_add_lru_vma(folio, vma);
> +               if (folio_test_large(swapcache))
> +                       memcg1_swapin(swapcache);
>                 folio_put_swap(swapcache, NULL);
>         } else if (!folio_test_anon(folio)) {
>                 /*
> @@ -5076,6 +5098,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>                 VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
>                 VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
>                 folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
> +               if (folio_test_large(folio))
> +                       memcg1_swapin(folio);
>                 folio_put_swap(folio, NULL);
>         } else {
>                 VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
> @@ -5132,8 +5156,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         if (vmf->pte)
>                 pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out_page:
> -       if (folio_test_swapcache(folio))
> +       if (folio_test_swapcache(folio)) {
> +               if (folio_test_large(folio))
> +                       memcg1_swapin(folio);
>                 folio_free_swap(folio);
> +       }
>         folio_unlock(folio);
>  out_release:
>         folio_put(folio);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index d37097913b30..f03ad4832f16 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -21,6 +21,7 @@
>  #include <linux/migrate.h>
>  #include <linux/vmalloc.h>
>  #include <linux/huge_mm.h>
> +#include <linux/zswap.h>
>  #include <linux/shmem_fs.h>
>  #include "internal.h"
>  #include "swap_table.h"
> @@ -403,7 +404,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
>  static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>                                         swp_entry_t targ_entry, gfp_t gfp,
>                                         unsigned int order, struct vm_fault *vmf,
> -                                       struct mempolicy *mpol, pgoff_t ilx)
> +                                       struct mempolicy *mpol, pgoff_t ilx,
> +                                       bool defer_memcg1_swapin)

Hi Fujunjie,

Thanks for the update, but this whole defer_memcg1_swapin thing is so
ugly I don't think this is the right way at all.

If you really need this, maybe you can always defer the memcg1
uncharge, I don't see why we need to treat large folio differently.
This charge doesn't affect the memory pressure; the reason we uncharge
memcg1's swap counter is to avoid long pinning swap cache holding the
swap cache of a cgroup so the cgroup will no longer be able to swap
out more folios. Deferring it won't hurt.

>  {
>         int err;
>         swp_entry_t entry;
> @@ -466,7 +468,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>         }
>
>         /* memsw uncharges swap when folio is added to swap cache */
> -       memcg1_swapin(folio);
> +       if (!defer_memcg1_swapin || !order)
> +               memcg1_swapin(folio);
>         if (shadow)
>                 workingset_refault(folio, shadow);
>
> @@ -495,9 +498,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>   * Return: Returns the folio if allocation succeeded and folio is in the swap
>   * cache. Returns error code if failed due to race, OOM or invalid arguments.
>   */
> -struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> -                                    unsigned long orders, struct vm_fault *vmf,
> -                                    struct mempolicy *mpol, pgoff_t ilx)
> +static struct folio *__swap_cache_alloc_folio(swp_entry_t targ_entry,
> +                                             gfp_t gfp, unsigned long orders,
> +                                             struct vm_fault *vmf,
> +                                             struct mempolicy *mpol,
> +                                             pgoff_t ilx,
> +                                             bool defer_memcg1_swapin)
>  {
>         int order, err;
>         struct folio *ret;
> @@ -512,7 +518,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>
>         do {
>                 ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
> -                                        vmf, mpol, ilx);
> +                                        vmf, mpol, ilx,
> +                                        defer_memcg1_swapin);
>                 if (!IS_ERR(ret))
>                         break;
>                 err = PTR_ERR(ret);
> @@ -525,6 +532,124 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>         return ret;
>  }
>
> +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> +                                    unsigned long orders, struct vm_fault *vmf,
> +                                    struct mempolicy *mpol, pgoff_t ilx)
> +{
> +       return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> +                                       mpol, ilx, false);
> +}
> +
> +static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
> +                                                       gfp_t gfp,
> +                                                       unsigned long orders,
> +                                                       struct vm_fault *vmf,
> +                                                       struct mempolicy *mpol,
> +                                                       pgoff_t ilx)
> +{
> +       /*
> +        * Speculative large swapin may drop this fresh swapcache folio and
> +        * retry order-0 after backend or page-table revalidation. Keep the
> +        * cgroup v1 memsw swap owner until the caller commits the folio.
> +        */
> +       return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> +                                       mpol, ilx, true);
> +}
> +
> +static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
> +{
> +       unsigned int ci_start = swp_cluster_offset(entry);
> +       struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
> +       bool is_zero;
> +       unsigned int i;
> +
> +       if (ci_start + nr_pages > SWAPFILE_CLUSTER) {
> +               VM_WARN_ON_ONCE(1);
> +               return false;
> +       }
> +
> +       rcu_read_lock();
> +       if (!rcu_dereference(ci->table)) {
> +               rcu_read_unlock();
> +               return true;
> +       }
> +
> +       is_zero = __swap_table_test_zero(ci, ci_start);
> +       for (i = 1; i < nr_pages; i++) {
> +               if (is_zero != __swap_table_test_zero(ci, ci_start + i)) {
> +                       rcu_read_unlock();
> +                       return false;
> +               }
> +       }
> +       rcu_read_unlock();
> +
> +       return true;
> +}
> +
> +static unsigned long swapin_admit_orders(swp_entry_t entry,
> +                                        unsigned long orders)

And this swapin_admit_orders chunk doesn't look good either...

> +{
> +       unsigned long candidates = orders & ~BIT(0);
> +       unsigned long admitted = orders & BIT(0);
> +       int order;
> +
> +       if (!candidates)
> +               return orders;
> +
> +       while (candidates) {
> +               enum zswap_range_state state;
> +               unsigned int nr_pages;
> +               swp_entry_t range_entry;
> +               bool admit = false;
> +
> +               order = fls_long(candidates) - 1;
> +               if (order > MAX_PAGE_ORDER) {
> +                       candidates &= ~BIT(order);
> +                       continue;
> +               }
> +
> +               nr_pages = 1U << order;
> +               range_entry = swp_entry(swp_type(entry),
> +                                       round_down(swp_offset(entry), nr_pages));
> +               if (!swapin_zeromap_same(range_entry, nr_pages))
> +                       goto next;

I think you don't need to test zeromap at all? __swap_cache_alloc
handles that already.

> +
> +               state = zswap_probe_range(range_entry, nr_pages);

If you just move the zswap_probe_range into __swap_cache_alloc and do
fallback there (or maybe you can shrink the order faster), then these
two new helpers are all redundant.

> +               switch (state) {
> +               case ZSWAP_RANGE_MIXED:
> +                       break;
> +               case ZSWAP_RANGE_ALL_ZSWAP:
> +               case ZSWAP_RANGE_NEVER_ENABLED:
> +               case ZSWAP_RANGE_NO_ZSWAP:
> +                       admit = true;
> +                       break;
> +               }
> +
> +next:
> +               if (admit)
> +                       admitted |= BIT(order);
> +               else
> +                       count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> +               candidates &= ~BIT(order);
> +       }
> +
> +       return admitted ? admitted : BIT(0);
> +}
> +
> +static bool zswap_needs_order0_retry(struct folio *folio)
> +{
> +       if (!folio_test_large(folio))
> +               return false;
> +
> +       /*
> +        * Admission sees only an advisory zswap snapshot. Recheck after the
> +        * large swapcache folio is installed; if the range became mixed, drop
> +        * the fresh folio before IO and let order-0 handle each slot.
> +        */
> +       return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
> +              ZSWAP_RANGE_MIXED;
> +}
> +

Again, I think you can just probe the suitable size in
__swap_cache_alloc directly. That way, we avoid the divergence between
sync / non-sync devices, and avoid this whole chunk, making the code much
simpler too, just like what we are already doing for the zero map in
__swap_cache_alloc. Or am I oversimplifying it?

^ permalink raw reply

* [tj-cgroup:for-next] BUILD SUCCESS f977fabf943f8b2a0c501b6457d0b4cc239922fa
From: kernel test robot @ 2026-05-29 14:09 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-next
branch HEAD: f977fabf943f8b2a0c501b6457d0b4cc239922fa  Merge branch 'for-7.2' into for-next

elapsed time: 1293m

configs tested: 187
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
alpha                             allnoconfig    gcc-15.2.0
alpha                            allyesconfig    gcc-15.2.0
alpha                               defconfig    gcc-15.2.0
arc                              allmodconfig    clang-16
arc                               allnoconfig    gcc-15.2.0
arc                              allyesconfig    clang-23
arc                                 defconfig    gcc-15.2.0
arc                   randconfig-001-20260529    clang-23
arc                   randconfig-002-20260529    clang-23
arm                               allnoconfig    gcc-15.2.0
arm                              allyesconfig    clang-16
arm                                 defconfig    gcc-15.2.0
arm                   randconfig-001-20260529    clang-23
arm                   randconfig-002-20260529    clang-23
arm                   randconfig-003-20260529    clang-23
arm                   randconfig-004-20260529    clang-23
arm64                            allmodconfig    clang-23
arm64                             allnoconfig    gcc-15.2.0
arm64                               defconfig    gcc-15.2.0
arm64                 randconfig-001-20260529    clang-23
arm64                 randconfig-002-20260529    clang-23
arm64                 randconfig-003-20260529    clang-23
arm64                 randconfig-004-20260529    clang-23
csky                             allmodconfig    gcc-15.2.0
csky                              allnoconfig    gcc-15.2.0
csky                                defconfig    gcc-15.2.0
csky                  randconfig-001-20260529    clang-23
csky                  randconfig-002-20260529    clang-23
hexagon                          allmodconfig    gcc-15.2.0
hexagon                           allnoconfig    gcc-15.2.0
hexagon                             defconfig    gcc-15.2.0
hexagon               randconfig-001-20260529    gcc-8.5.0
hexagon               randconfig-002-20260529    gcc-8.5.0
i386                             allmodconfig    clang-20
i386                              allnoconfig    gcc-15.2.0
i386                             allyesconfig    clang-20
i386        buildonly-randconfig-001-20260529    gcc-12
i386        buildonly-randconfig-002-20260529    gcc-12
i386        buildonly-randconfig-003-20260529    gcc-12
i386        buildonly-randconfig-004-20260529    gcc-12
i386        buildonly-randconfig-005-20260529    gcc-12
i386        buildonly-randconfig-006-20260529    gcc-12
i386                                defconfig    gcc-15.2.0
i386                           randconfig-001    gcc-14
i386                  randconfig-001-20260529    gcc-14
i386                           randconfig-002    gcc-14
i386                  randconfig-002-20260529    gcc-14
i386                           randconfig-003    gcc-14
i386                  randconfig-003-20260529    gcc-14
i386                           randconfig-004    gcc-14
i386                  randconfig-004-20260529    gcc-14
i386                           randconfig-005    gcc-14
i386                  randconfig-005-20260529    gcc-14
i386                           randconfig-006    gcc-14
i386                  randconfig-006-20260529    gcc-14
i386                           randconfig-007    gcc-14
i386                  randconfig-007-20260529    gcc-14
i386                  randconfig-011-20260529    gcc-14
i386                  randconfig-012-20260529    gcc-14
i386                  randconfig-013-20260529    gcc-14
i386                  randconfig-014-20260529    gcc-14
i386                  randconfig-015-20260529    gcc-14
i386                  randconfig-016-20260529    gcc-14
i386                  randconfig-017-20260529    gcc-14
loongarch                        allmodconfig    clang-23
loongarch                         allnoconfig    gcc-15.2.0
loongarch                           defconfig    clang-19
loongarch             randconfig-001-20260529    gcc-8.5.0
loongarch             randconfig-002-20260529    gcc-8.5.0
m68k                             allmodconfig    gcc-15.2.0
m68k                              allnoconfig    gcc-15.2.0
m68k                             allyesconfig    clang-16
m68k                                defconfig    clang-19
microblaze                        allnoconfig    gcc-15.2.0
microblaze                       allyesconfig    gcc-15.2.0
microblaze                          defconfig    clang-19
mips                             allmodconfig    gcc-15.2.0
mips                              allnoconfig    gcc-15.2.0
mips                             allyesconfig    gcc-15.2.0
nios2                            alldefconfig    gcc-11.5.0
nios2                            allmodconfig    clang-23
nios2                             allnoconfig    clang-23
nios2                               defconfig    clang-19
nios2                 randconfig-001-20260529    gcc-8.5.0
nios2                 randconfig-002-20260529    gcc-8.5.0
openrisc                         allmodconfig    clang-23
openrisc                          allnoconfig    clang-23
openrisc                            defconfig    gcc-15.2.0
parisc                           allmodconfig    gcc-15.2.0
parisc                            allnoconfig    clang-23
parisc                           allyesconfig    clang-19
parisc                              defconfig    gcc-15.2.0
parisc                         randconfig-001    clang-19
parisc                randconfig-001-20260529    clang-19
parisc                         randconfig-002    clang-19
parisc                randconfig-002-20260529    clang-19
parisc64                            defconfig    clang-19
powerpc                          allmodconfig    gcc-15.2.0
powerpc                           allnoconfig    clang-23
powerpc                        randconfig-001    clang-19
powerpc               randconfig-001-20260529    clang-19
powerpc                        randconfig-002    clang-19
powerpc               randconfig-002-20260529    clang-19
powerpc64                      randconfig-001    clang-19
powerpc64             randconfig-001-20260529    clang-19
powerpc64                      randconfig-002    clang-19
powerpc64             randconfig-002-20260529    clang-19
riscv                            allmodconfig    clang-23
riscv                             allnoconfig    clang-23
riscv                            allyesconfig    clang-16
riscv                               defconfig    gcc-15.2.0
riscv                 randconfig-001-20260529    gcc-15.2.0
riscv                 randconfig-002-20260529    gcc-15.2.0
s390                             allmodconfig    clang-19
s390                              allnoconfig    clang-23
s390                             allyesconfig    gcc-15.2.0
s390                                defconfig    gcc-15.2.0
s390                  randconfig-001-20260529    gcc-15.2.0
s390                  randconfig-002-20260529    gcc-15.2.0
sh                               allmodconfig    gcc-15.2.0
sh                                allnoconfig    clang-23
sh                               allyesconfig    clang-19
sh                                  defconfig    gcc-14
sh                    randconfig-001-20260529    gcc-15.2.0
sh                    randconfig-002-20260529    gcc-15.2.0
sparc                             allnoconfig    clang-23
sparc                               defconfig    gcc-15.2.0
sparc                 randconfig-001-20260529    gcc-11.5.0
sparc                 randconfig-002-20260529    gcc-11.5.0
sparc64                          allmodconfig    clang-23
sparc64                             defconfig    gcc-14
sparc64               randconfig-001-20260529    gcc-11.5.0
sparc64               randconfig-002-20260529    gcc-11.5.0
um                               allmodconfig    clang-19
um                                allnoconfig    clang-23
um                               allyesconfig    gcc-15.2.0
um                                  defconfig    gcc-14
um                             i386_defconfig    gcc-14
um                    randconfig-001-20260529    gcc-11.5.0
um                    randconfig-002-20260529    gcc-11.5.0
um                           x86_64_defconfig    gcc-14
x86_64                           allmodconfig    clang-20
x86_64                            allnoconfig    clang-23
x86_64                           allyesconfig    clang-20
x86_64      buildonly-randconfig-001-20260529    gcc-14
x86_64      buildonly-randconfig-002-20260529    gcc-14
x86_64      buildonly-randconfig-003-20260529    gcc-14
x86_64      buildonly-randconfig-004-20260529    gcc-14
x86_64      buildonly-randconfig-005-20260529    gcc-14
x86_64      buildonly-randconfig-006-20260529    gcc-14
x86_64                              defconfig    gcc-14
x86_64                                  kexec    clang-20
x86_64                         randconfig-001    clang-20
x86_64                randconfig-001-20260529    clang-20
x86_64                         randconfig-002    clang-20
x86_64                randconfig-002-20260529    clang-20
x86_64                         randconfig-003    clang-20
x86_64                randconfig-003-20260529    clang-20
x86_64                         randconfig-004    clang-20
x86_64                randconfig-004-20260529    clang-20
x86_64                         randconfig-005    clang-20
x86_64                randconfig-005-20260529    clang-20
x86_64                         randconfig-006    clang-20
x86_64                randconfig-006-20260529    clang-20
x86_64                randconfig-011-20260529    clang-20
x86_64                randconfig-012-20260529    clang-20
x86_64                randconfig-013-20260529    clang-20
x86_64                randconfig-014-20260529    clang-20
x86_64                randconfig-015-20260529    clang-20
x86_64                randconfig-016-20260529    clang-20
x86_64                randconfig-071-20260529    clang-20
x86_64                randconfig-072-20260529    clang-20
x86_64                randconfig-073-20260529    clang-20
x86_64                randconfig-074-20260529    clang-20
x86_64                randconfig-075-20260529    clang-20
x86_64                randconfig-076-20260529    clang-20
x86_64                               rhel-9.4    clang-20
x86_64                           rhel-9.4-bpf    gcc-14
x86_64                          rhel-9.4-func    clang-20
x86_64                    rhel-9.4-kselftests    clang-20
x86_64                         rhel-9.4-kunit    gcc-14
x86_64                           rhel-9.4-ltp    gcc-14
x86_64                          rhel-9.4-rust    clang-20
xtensa                            allnoconfig    clang-23
xtensa                           allyesconfig    clang-23
xtensa                randconfig-001-20260529    gcc-11.5.0
xtensa                randconfig-002-20260529    gcc-11.5.0

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [tj-cgroup:for-7.2] BUILD SUCCESS 336f87d742a616236006bb77275f79a3ac101637
From: kernel test robot @ 2026-05-29 13:58 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-7.2
branch HEAD: 336f87d742a616236006bb77275f79a3ac101637  cgroup: pair max limit READ_ONCE() with WRITE_ONCE()

elapsed time: 1282m

configs tested: 187
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
alpha                             allnoconfig    gcc-15.2.0
alpha                            allyesconfig    gcc-15.2.0
alpha                               defconfig    gcc-15.2.0
arc                              allmodconfig    clang-16
arc                               allnoconfig    gcc-15.2.0
arc                              allyesconfig    clang-23
arc                                 defconfig    gcc-15.2.0
arc                   randconfig-001-20260529    clang-23
arc                   randconfig-002-20260529    clang-23
arm                               allnoconfig    gcc-15.2.0
arm                              allyesconfig    clang-16
arm                                 defconfig    gcc-15.2.0
arm                   randconfig-001-20260529    clang-23
arm                   randconfig-002-20260529    clang-23
arm                   randconfig-003-20260529    clang-23
arm                   randconfig-004-20260529    clang-23
arm64                            allmodconfig    clang-23
arm64                             allnoconfig    gcc-15.2.0
arm64                               defconfig    gcc-15.2.0
arm64                 randconfig-001-20260529    clang-23
arm64                 randconfig-002-20260529    clang-23
arm64                 randconfig-003-20260529    clang-23
arm64                 randconfig-004-20260529    clang-23
csky                             allmodconfig    gcc-15.2.0
csky                              allnoconfig    gcc-15.2.0
csky                                defconfig    gcc-15.2.0
csky                  randconfig-001-20260529    clang-23
csky                  randconfig-002-20260529    clang-23
hexagon                          allmodconfig    gcc-15.2.0
hexagon                           allnoconfig    gcc-15.2.0
hexagon                             defconfig    gcc-15.2.0
hexagon               randconfig-001-20260529    gcc-8.5.0
hexagon               randconfig-002-20260529    gcc-8.5.0
i386                             allmodconfig    clang-20
i386                              allnoconfig    gcc-15.2.0
i386                             allyesconfig    clang-20
i386        buildonly-randconfig-001-20260529    gcc-12
i386        buildonly-randconfig-002-20260529    gcc-12
i386        buildonly-randconfig-003-20260529    gcc-12
i386        buildonly-randconfig-004-20260529    gcc-12
i386        buildonly-randconfig-005-20260529    gcc-12
i386        buildonly-randconfig-006-20260529    gcc-12
i386                                defconfig    gcc-15.2.0
i386                           randconfig-001    gcc-14
i386                  randconfig-001-20260529    gcc-14
i386                           randconfig-002    gcc-14
i386                  randconfig-002-20260529    gcc-14
i386                           randconfig-003    gcc-14
i386                  randconfig-003-20260529    gcc-14
i386                           randconfig-004    gcc-14
i386                  randconfig-004-20260529    gcc-14
i386                           randconfig-005    gcc-14
i386                  randconfig-005-20260529    gcc-14
i386                           randconfig-006    gcc-14
i386                  randconfig-006-20260529    gcc-14
i386                           randconfig-007    gcc-14
i386                  randconfig-007-20260529    gcc-14
i386                  randconfig-011-20260529    gcc-14
i386                  randconfig-012-20260529    gcc-14
i386                  randconfig-013-20260529    gcc-14
i386                  randconfig-014-20260529    gcc-14
i386                  randconfig-015-20260529    gcc-14
i386                  randconfig-016-20260529    gcc-14
i386                  randconfig-017-20260529    gcc-14
loongarch                        allmodconfig    clang-23
loongarch                         allnoconfig    gcc-15.2.0
loongarch                           defconfig    clang-19
loongarch             randconfig-001-20260529    gcc-8.5.0
loongarch             randconfig-002-20260529    gcc-8.5.0
m68k                             allmodconfig    gcc-15.2.0
m68k                              allnoconfig    gcc-15.2.0
m68k                             allyesconfig    clang-16
m68k                                defconfig    clang-19
microblaze                        allnoconfig    gcc-15.2.0
microblaze                       allyesconfig    gcc-15.2.0
microblaze                          defconfig    clang-19
mips                             allmodconfig    gcc-15.2.0
mips                              allnoconfig    gcc-15.2.0
mips                             allyesconfig    gcc-15.2.0
nios2                            alldefconfig    gcc-11.5.0
nios2                            allmodconfig    clang-23
nios2                             allnoconfig    clang-23
nios2                               defconfig    clang-19
nios2                 randconfig-001-20260529    gcc-8.5.0
nios2                 randconfig-002-20260529    gcc-8.5.0
openrisc                         allmodconfig    clang-23
openrisc                          allnoconfig    clang-23
openrisc                            defconfig    gcc-15.2.0
parisc                           allmodconfig    gcc-15.2.0
parisc                            allnoconfig    clang-23
parisc                           allyesconfig    clang-19
parisc                              defconfig    gcc-15.2.0
parisc                         randconfig-001    clang-19
parisc                randconfig-001-20260529    clang-19
parisc                         randconfig-002    clang-19
parisc                randconfig-002-20260529    clang-19
parisc64                            defconfig    clang-19
powerpc                          allmodconfig    gcc-15.2.0
powerpc                           allnoconfig    clang-23
powerpc                        randconfig-001    clang-19
powerpc               randconfig-001-20260529    clang-19
powerpc                        randconfig-002    clang-19
powerpc               randconfig-002-20260529    clang-19
powerpc64                      randconfig-001    clang-19
powerpc64             randconfig-001-20260529    clang-19
powerpc64                      randconfig-002    clang-19
powerpc64             randconfig-002-20260529    clang-19
riscv                            allmodconfig    clang-23
riscv                             allnoconfig    clang-23
riscv                            allyesconfig    clang-16
riscv                               defconfig    gcc-15.2.0
riscv                 randconfig-001-20260529    gcc-15.2.0
riscv                 randconfig-002-20260529    gcc-15.2.0
s390                             allmodconfig    clang-19
s390                              allnoconfig    clang-23
s390                             allyesconfig    gcc-15.2.0
s390                                defconfig    gcc-15.2.0
s390                  randconfig-001-20260529    gcc-15.2.0
s390                  randconfig-002-20260529    gcc-15.2.0
sh                               allmodconfig    gcc-15.2.0
sh                                allnoconfig    clang-23
sh                               allyesconfig    clang-19
sh                                  defconfig    gcc-14
sh                    randconfig-001-20260529    gcc-15.2.0
sh                    randconfig-002-20260529    gcc-15.2.0
sparc                             allnoconfig    clang-23
sparc                               defconfig    gcc-15.2.0
sparc                 randconfig-001-20260529    gcc-11.5.0
sparc                 randconfig-002-20260529    gcc-11.5.0
sparc64                          allmodconfig    clang-23
sparc64                             defconfig    gcc-14
sparc64               randconfig-001-20260529    gcc-11.5.0
sparc64               randconfig-002-20260529    gcc-11.5.0
um                               allmodconfig    clang-19
um                                allnoconfig    clang-23
um                               allyesconfig    gcc-15.2.0
um                                  defconfig    gcc-14
um                             i386_defconfig    gcc-14
um                    randconfig-001-20260529    gcc-11.5.0
um                    randconfig-002-20260529    gcc-11.5.0
um                           x86_64_defconfig    gcc-14
x86_64                           allmodconfig    clang-20
x86_64                            allnoconfig    clang-23
x86_64                           allyesconfig    clang-20
x86_64      buildonly-randconfig-001-20260529    gcc-14
x86_64      buildonly-randconfig-002-20260529    gcc-14
x86_64      buildonly-randconfig-003-20260529    gcc-14
x86_64      buildonly-randconfig-004-20260529    gcc-14
x86_64      buildonly-randconfig-005-20260529    gcc-14
x86_64      buildonly-randconfig-006-20260529    gcc-14
x86_64                              defconfig    gcc-14
x86_64                                  kexec    clang-20
x86_64                         randconfig-001    clang-20
x86_64                randconfig-001-20260529    clang-20
x86_64                         randconfig-002    clang-20
x86_64                randconfig-002-20260529    clang-20
x86_64                         randconfig-003    clang-20
x86_64                randconfig-003-20260529    clang-20
x86_64                         randconfig-004    clang-20
x86_64                randconfig-004-20260529    clang-20
x86_64                         randconfig-005    clang-20
x86_64                randconfig-005-20260529    clang-20
x86_64                         randconfig-006    clang-20
x86_64                randconfig-006-20260529    clang-20
x86_64                randconfig-011-20260529    clang-20
x86_64                randconfig-012-20260529    clang-20
x86_64                randconfig-013-20260529    clang-20
x86_64                randconfig-014-20260529    clang-20
x86_64                randconfig-015-20260529    clang-20
x86_64                randconfig-016-20260529    clang-20
x86_64                randconfig-071-20260529    clang-20
x86_64                randconfig-072-20260529    clang-20
x86_64                randconfig-073-20260529    clang-20
x86_64                randconfig-074-20260529    clang-20
x86_64                randconfig-075-20260529    clang-20
x86_64                randconfig-076-20260529    clang-20
x86_64                               rhel-9.4    clang-20
x86_64                           rhel-9.4-bpf    gcc-14
x86_64                          rhel-9.4-func    clang-20
x86_64                    rhel-9.4-kselftests    clang-20
x86_64                         rhel-9.4-kunit    gcc-14
x86_64                           rhel-9.4-ltp    gcc-14
x86_64                          rhel-9.4-rust    clang-20
xtensa                            allnoconfig    clang-23
xtensa                           allyesconfig    clang-23
xtensa                randconfig-001-20260529    gcc-11.5.0
xtensa                randconfig-002-20260529    gcc-11.5.0

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v5 5/9] mm: list_lru: deduplicate lock_list_lru()
From: Johannes Weiner @ 2026-05-29 13:42 UTC (permalink / raw)
  To: Wei Yang
  Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes, Shakeel Butt,
	Michal Hocko, Dave Chinner, Roman Gushchin, Muchun Song, Qi Zheng,
	Yosry Ahmed, Zi Yan, Liam R . Howlett, Usama Arif,
	Kiryl Shutsemau, Vlastimil Babka, Kairui Song, Mikhail Zaslonko,
	Vasily Gorbik, Baolin Wang, Barry Song, Dev Jain, Lance Yang,
	Nico Pache, Ryan Roberts, cgroups, linux-mm, linux-kernel
In-Reply-To: <20260529095628.nagjdy3f24z6qjtk@master>

On Fri, May 29, 2026 at 09:56:28AM +0000, Wei Yang wrote:
> On Wed, May 27, 2026 at 04:45:12PM -0400, Johannes Weiner wrote:
> >The MEMCG and !MEMCG paths have the same pattern. Share the code.
> >
> >Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
> >Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> >Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> >Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> >Reviewed-by: Liam R. Howlett (Oracle) <liam@infradead.org>
> >---
> > mm/list_lru.c | 21 +++++++++------------
> > 1 file changed, 9 insertions(+), 12 deletions(-)
> >
> >diff --git a/mm/list_lru.c b/mm/list_lru.c
> >index 7d0523e44010..fdb3fe2ea64f 100644
> >--- a/mm/list_lru.c
> >+++ b/mm/list_lru.c
> >@@ -15,6 +15,14 @@
> > #include "slab.h"
> > #include "internal.h"
> 
> Hi, Johannes
> 
> One very tiny nit below.
> 
> > 
> >+static inline void lock_list_lru(struct list_lru_one *l, bool irq)
> 
> Here we use @irq.
> 
> >+{
> >+	if (irq)
> >+		spin_lock_irq(&l->lock);
> >+	else
> >+		spin_lock(&l->lock);
> >+}
> >+
> > static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
> 
> Here we use @irq_off.
> 
> Do you think it would be nicer to unify the parameter name?

Yes, I think it would be nicer.

Note that I inherited this - we had irq on the lock and irq_off on the
unlock before already. I didn't want to mix even more yak shaving prep
patches into this series.

Mind sending a follow-up patch on top of mm-unstable?

^ permalink raw reply

* Re: [PATCH v3] cgroup/dmem: introduce a peak file
From: Michal Koutný @ 2026-05-29 13:01 UTC (permalink / raw)
  To: Maarten Lankhorst
  Cc: Thadeu Lima de Souza Cascardo, Tejun Heo, Johannes Weiner,
	Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
	Andrew Morton, Jonathan Corbet, Shuah Khan, Maxime Ripard,
	Natalie Vock, Tvrtko Ursulin, cgroups, linux-kernel, linux-mm,
	linux-doc, dri-devel, kernel-dev
In-Reply-To: <89901220-0a43-4668-9d20-aaecc72c58dd@lankhorst.se>

[-- Attachment #1: Type: text/plain, Size: 282 bytes --]

On Fri, May 29, 2026 at 09:34:28AM +0200, Maarten Lankhorst <dev@lankhorst.se> wrote:
> > Reviewed-by: Michal Koutný <mkoutny@suse.com>
> Reviewed-by: Maarten Lankhorst <dev@lankhorst.se>
> 
> With your r-b it's ok to push it to the dmemcg tree?

Please go for it.

Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: kernel test robot @ 2026-05-29 12:47 UTC (permalink / raw)
  To: Yury Norov, Andrew Morton, David Hildenbrand, Zi Yan,
	Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, linux-kernel
  Cc: oe-kbuild-all, Linux Memory Management List, Yury Norov,
	Farhad Alemi, Waiman Long, Rasmus Villemoes, cgroups
In-Reply-To: <20260528190337.878027-1-ynorov@nvidia.com>

Hi Yury,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Yury-Norov/mm-don-t-allow-empty-relative-nodemask-in-mpol_relative_nodemask/20260529-030835
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260528190337.878027-1-ynorov%40nvidia.com
patch subject: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
config: sparc64-randconfig-002-20260529 (https://download.01.org/0day-ci/archive/20260529/202605292049.eaIv99hr-lkp@intel.com/config)
compiler: sparc64-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605292049.eaIv99hr-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605292049.eaIv99hr-lkp@intel.com/

All warnings (new ones prefixed by >>):

   mm/mempolicy.c: In function 'mpol_relative_nodemask':
>> mm/mempolicy.c:377:10: warning: 'return' with a value, in function returning void
      return -EINVAL;
             ^
   mm/mempolicy.c:370:13: note: declared here
    static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                ^~~~~~~~~~~~~~~~~~~~~~


vim +/return +377 mm/mempolicy.c

   369	
   370	static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
   371					   const nodemask_t *rel)
   372	{
   373		unsigned int w = nodes_weight(*rel);
   374		nodemask_t tmp;
   375	
   376		if (w == 0)
 > 377			return -EINVAL;
   378	
   379		nodes_fold(tmp, *orig, w);
   380		nodes_onto(*ret, tmp, *rel);
   381	}
   382	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
From: kernel test robot @ 2026-05-29 12:45 UTC (permalink / raw)
  To: Yury Norov, Andrew Morton, David Hildenbrand, Zi Yan,
	Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, linux-kernel
  Cc: llvm, oe-kbuild-all, Linux Memory Management List, Yury Norov,
	Farhad Alemi, Waiman Long, Rasmus Villemoes, cgroups
In-Reply-To: <20260528190337.878027-1-ynorov@nvidia.com>

Hi Yury,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Yury-Norov/mm-don-t-allow-empty-relative-nodemask-in-mpol_relative_nodemask/20260529-030835
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260528190337.878027-1-ynorov%40nvidia.com
patch subject: [PATCH] mm: don't allow empty relative nodemask in mpol_relative_nodemask()
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260529/202605291432.MbAf9EG6-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605291432.MbAf9EG6-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605291432.MbAf9EG6-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/mempolicy.c:377:3: warning: void function 'mpol_relative_nodemask' should not return a value [-Wreturn-mismatch]
     377 |                 return -EINVAL;
         |                 ^      ~~~~~~~
   1 warning generated.


vim +/mpol_relative_nodemask +377 mm/mempolicy.c

   369	
   370	static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
   371					   const nodemask_t *rel)
   372	{
   373		unsigned int w = nodes_weight(*rel);
   374		nodemask_t tmp;
   375	
   376		if (w == 0)
 > 377			return -EINVAL;
   378	
   379		nodes_fold(tmp, *orig, w);
   380		nodes_onto(*ret, tmp, *rel);
   381	}
   382	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH rdma-next v2 0/3] cgroup/rdma: add MR memory size resource tracking
From: Michal Koutný @ 2026-05-29 12:46 UTC (permalink / raw)
  To: Tao Cui; +Cc: tj, hannes, leon, jgg, linux-rdma, cgroups, Tao Cui
In-Reply-To: <20260529090733.2242822-1-cui.tao@linux.dev>

[-- Attachment #1: Type: text/plain, Size: 1600 bytes --]

Hi.

On Fri, May 29, 2026 at 05:07:30PM +0800, Tao Cui <cui.tao@linux.dev> wrote:
> The real scarce resource in multi-tenant
> deployments is pinned memory: how much physical memory gets registered
> through MRs.
> ...
> 3. Overlap with memory cgroup: mr_mem does not count process memory
>    usage; it represents a per-device DMA registration budget: the
>    amount of memory this cgroup may register through a given HCA.
>    This is a different dimension from what memory cgroup tracks.  An
>    administrator might set mr_mem limits differently per device, which
>    memory cgroup cannot express.
> 
>    In particular, mr_mem tracks the registered memory range associated
>    with the MR rather than exact dynamically pinned pages (e.g. for
>    ODP MRs).  This is a stable, policy-oriented approximation of
>    registration footprint, not an attempt at precise physical page
>    accounting.

IIUC the pinned memory is regular RAM, i.e. it could be controlled with
memcg as needed. Or is there a "physical" limit on what can be assigned to
a single device?

BTW, have a look at [1]; it'd be good to converge on a similar approach
(the current proposal allows distinguishing whether charging should
include or exempt memcg counting). Also, it seems that the dmem
controller could be a one-stop solution for all DMA charges. Please tell
me if there are any distinguishing factors between RDMA devices' memory
and these dmem memory regions.

Thanks,
Michal


[1] https://lore.kernel.org/r/20260519-cgroup-dmem-memcg-double-charge-v2-0-db4d1407062b@redhat.com/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 265 bytes --]

^ permalink raw reply

* [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

A large swapin can only read one folio when the whole range has compatible
backing. Mixed zswap/disk ranges must not reach large-folio IO, and zswap
range probes are only snapshots.

Filter the orders passed to swap_cache_alloc_folio() in swapin_sync().
Uniform zeromap ranges and all-disk ranges keep the existing large swapin
path. Fully zswap-backed ranges may be tried. Mixed zswap/disk ranges fall
back before allocation.

After a large swapcache folio is installed, recheck the zswap range and
drop the fresh folio if it became mixed. Also consume -EAGAIN from
swap_read_folio() the same way. Both cases retry order-0, where each slot
can resolve its current backend independently.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memcontrol-v1.c |   8 ++-
 mm/memory.c        |  31 ++++++++-
 mm/swap_state.c    | 169 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 194 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 765069211567..5b11b8055c66 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -682,8 +682,8 @@ void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
  * memcg1_swapin - uncharge swap slot on swapin
  * @folio: folio being swapped in
  *
- * Call this function after successfully adding the charged
- * folio to swapcache.
+ * Call this after the charged folio has been added to swapcache and the caller
+ * is no longer going to drop it back to swapped-out state.
  *
  * Context: The folio has to be in swap cache and locked.
  */
@@ -721,7 +721,9 @@ void memcg1_swapin(struct folio *folio)
 	id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
 				 nr_pages);
 	swap_cluster_unlock(ci);
-	mem_cgroup_uncharge_swap(id, nr_pages);
+
+	if (id)
+		mem_cgroup_uncharge_swap(id, nr_pages);
 }
 #endif
 
diff --git a/mm/memory.c b/mm/memory.c
index 5a365492a9a2..d73a19692dea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4538,6 +4538,24 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
 		folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
 }
 
+static void memcg1_swapin_retry_folio(struct folio *folio,
+				      struct vm_fault *vmf)
+{
+	if (!folio_test_large(folio) || !folio_test_swapcache(folio))
+		return;
+
+	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+		if (!folio_trylock(folio))
+			return;
+	} else {
+		folio_lock(folio);
+	}
+
+	if (folio_test_large(folio) && folio_test_swapcache(folio))
+		memcg1_swapin(folio);
+	folio_unlock(folio);
+}
+
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 {
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
@@ -4857,8 +4875,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 	swapcache = folio;
 	ret |= folio_lock_or_retry(folio, vmf);
-	if (ret & VM_FAULT_RETRY)
+	if (ret & VM_FAULT_RETRY) {
+		memcg1_swapin_retry_folio(folio, vmf);
 		goto out_release;
+	}
 
 	page = folio_file_page(folio, swp_offset(entry));
 	/*
@@ -5067,6 +5087,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (unlikely(folio != swapcache)) {
 		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
 		folio_add_lru_vma(folio, vma);
+		if (folio_test_large(swapcache))
+			memcg1_swapin(swapcache);
 		folio_put_swap(swapcache, NULL);
 	} else if (!folio_test_anon(folio)) {
 		/*
@@ -5076,6 +5098,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
 		VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
 		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
+		if (folio_test_large(folio))
+			memcg1_swapin(folio);
 		folio_put_swap(folio, NULL);
 	} else {
 		VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
@@ -5132,8 +5156,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
-	if (folio_test_swapcache(folio))
+	if (folio_test_swapcache(folio)) {
+		if (folio_test_large(folio))
+			memcg1_swapin(folio);
 		folio_free_swap(folio);
+	}
 	folio_unlock(folio);
 out_release:
 	folio_put(folio);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d37097913b30..f03ad4832f16 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,7 @@
 #include <linux/migrate.h>
 #include <linux/vmalloc.h>
 #include <linux/huge_mm.h>
+#include <linux/zswap.h>
 #include <linux/shmem_fs.h>
 #include "internal.h"
 #include "swap_table.h"
@@ -403,7 +404,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
 static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 					swp_entry_t targ_entry, gfp_t gfp,
 					unsigned int order, struct vm_fault *vmf,
-					struct mempolicy *mpol, pgoff_t ilx)
+					struct mempolicy *mpol, pgoff_t ilx,
+					bool defer_memcg1_swapin)
 {
 	int err;
 	swp_entry_t entry;
@@ -466,7 +468,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 	}
 
 	/* memsw uncharges swap when folio is added to swap cache */
-	memcg1_swapin(folio);
+	if (!defer_memcg1_swapin || !order)
+		memcg1_swapin(folio);
 	if (shadow)
 		workingset_refault(folio, shadow);
 
@@ -495,9 +498,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
  * Return: Returns the folio if allocation succeeded and folio is in the swap
  * cache. Returns error code if failed due to race, OOM or invalid arguments.
  */
-struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
-				     unsigned long orders, struct vm_fault *vmf,
-				     struct mempolicy *mpol, pgoff_t ilx)
+static struct folio *__swap_cache_alloc_folio(swp_entry_t targ_entry,
+					      gfp_t gfp, unsigned long orders,
+					      struct vm_fault *vmf,
+					      struct mempolicy *mpol,
+					      pgoff_t ilx,
+					      bool defer_memcg1_swapin)
 {
 	int order, err;
 	struct folio *ret;
@@ -512,7 +518,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
 
 	do {
 		ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
-					 vmf, mpol, ilx);
+					 vmf, mpol, ilx,
+					 defer_memcg1_swapin);
 		if (!IS_ERR(ret))
 			break;
 		err = PTR_ERR(ret);
@@ -525,6 +532,124 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
 	return ret;
 }
 
+struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
+				     unsigned long orders, struct vm_fault *vmf,
+				     struct mempolicy *mpol, pgoff_t ilx)
+{
+	return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
+					mpol, ilx, false);
+}
+
+static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
+							gfp_t gfp,
+							unsigned long orders,
+							struct vm_fault *vmf,
+							struct mempolicy *mpol,
+							pgoff_t ilx)
+{
+	/*
+	 * Speculative large swapin may drop this fresh swapcache folio and
+	 * retry order-0 after backend or page-table revalidation. Keep the
+	 * cgroup v1 memsw swap owner until the caller commits the folio.
+	 */
+	return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
+					mpol, ilx, true);
+}
+
+static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
+{
+	unsigned int ci_start = swp_cluster_offset(entry);
+	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+	bool is_zero;
+	unsigned int i;
+
+	if (ci_start + nr_pages > SWAPFILE_CLUSTER) {
+		VM_WARN_ON_ONCE(1);
+		return false;
+	}
+
+	rcu_read_lock();
+	if (!rcu_dereference(ci->table)) {
+		rcu_read_unlock();
+		return true;
+	}
+
+	is_zero = __swap_table_test_zero(ci, ci_start);
+	for (i = 1; i < nr_pages; i++) {
+		if (is_zero != __swap_table_test_zero(ci, ci_start + i)) {
+			rcu_read_unlock();
+			return false;
+		}
+	}
+	rcu_read_unlock();
+
+	return true;
+}
+
+static unsigned long swapin_admit_orders(swp_entry_t entry,
+					 unsigned long orders)
+{
+	unsigned long candidates = orders & ~BIT(0);
+	unsigned long admitted = orders & BIT(0);
+	int order;
+
+	if (!candidates)
+		return orders;
+
+	while (candidates) {
+		enum zswap_range_state state;
+		unsigned int nr_pages;
+		swp_entry_t range_entry;
+		bool admit = false;
+
+		order = fls_long(candidates) - 1;
+		if (order > MAX_PAGE_ORDER) {
+			candidates &= ~BIT(order);
+			continue;
+		}
+
+		nr_pages = 1U << order;
+		range_entry = swp_entry(swp_type(entry),
+					round_down(swp_offset(entry), nr_pages));
+		if (!swapin_zeromap_same(range_entry, nr_pages))
+			goto next;
+
+		state = zswap_probe_range(range_entry, nr_pages);
+		switch (state) {
+		case ZSWAP_RANGE_MIXED:
+			break;
+		case ZSWAP_RANGE_ALL_ZSWAP:
+		case ZSWAP_RANGE_NEVER_ENABLED:
+		case ZSWAP_RANGE_NO_ZSWAP:
+			admit = true;
+			break;
+		}
+
+next:
+		if (admit)
+			admitted |= BIT(order);
+		else
+			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
+		candidates &= ~BIT(order);
+	}
+
+	return admitted ? admitted : BIT(0);
+}
+
+static bool zswap_needs_order0_retry(struct folio *folio)
+{
+	if (!folio_test_large(folio))
+		return false;
+
+	/*
+	 * Admission sees only an advisory zswap snapshot. Recheck after the
+	 * large swapcache folio is installed; if the range became mixed, drop
+	 * the fresh folio before IO and let order-0 handle each slot.
+	 */
+	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
+	       ZSWAP_RANGE_MIXED;
+}
+
 /*
  * If we are the only user, then try to free up the swap cache.
  *
@@ -634,7 +759,8 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 		folio = swap_cache_get_folio(entry);
 		if (folio)
 			return folio;
-		folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
+		folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL,
+					       mpol, ilx);
 	} while (PTR_ERR(folio) == -EEXIST);
 
 	if (IS_ERR_OR_NULL(folio))
@@ -677,18 +803,43 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
 	struct folio *folio;
 	int ret;
 
+	orders = swapin_admit_orders(entry, orders);
+again:
 	do {
 		folio = swap_cache_get_folio(entry);
 		if (folio)
 			return folio;
-		folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
+		folio = swap_cache_alloc_speculative_folio(entry, gfp, orders,
+							   vmf, mpol, ilx);
 	} while (PTR_ERR(folio) == -EEXIST);
 
 	if (IS_ERR(folio))
 		return folio;
 
+	if (zswap_needs_order0_retry(folio)) {
+		count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN_FALLBACK);
+		/*
+		 * The folio is newly allocated, locked, clean and not uptodate;
+		 * no data has been read into it. Removing it only restores the
+		 * swap table entries so order-0 swapin can resolve a backend
+		 * race without attempting speculative large-folio zswapin.
+		 */
+		swap_cache_del_folio(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+		orders = BIT(0);
+		goto again;
+	}
+
 	ret = swap_read_folio(folio, NULL);
-	VM_WARN_ON_ONCE(ret == -EAGAIN);
+	if (ret == -EAGAIN) {
+		count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN_FALLBACK);
+		swap_cache_del_folio(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+		orders = BIT(0);
+		goto again;
+	}
 	return folio;
 }
 
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 9/9] docs: mm: update THP swapin counter descriptions
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

The THP swapin counter descriptions still describe large swapin as
coming only from non-zswap swap devices. Update them now that
zswap-backed large folio swapin can also increment swpin.

Also describe policy and backend rejection as swpin_fallback cases,
since speculative zswap large swapin can intentionally fall back before
doing large IO.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 23f8d13c2629..59b7a0d09243 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -667,13 +667,14 @@ zswpout
 	piece without splitting.
 
 swpin
-	is incremented every time a huge page is swapped in from a non-zswap
-	swap device in one piece.
+	is incremented every time a huge page is swapped in from swap or
+	zswap in one piece.
 
 swpin_fallback
-	is incremented if swapin fails to allocate or charge a huge page
-	and instead falls back to using huge pages with lower orders or
-	small pages.
+	is incremented if swapin cannot use a huge page and instead falls
+	back to using huge pages with lower orders or small pages. This can
+	happen because allocation or charging fails, or because policy or
+	backend state rejects a speculative large swapin.
 
 swpin_fallback_charge
 	is incremented if swapin fails to charge a huge page and instead
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 8/9] mm: try all-zswap large swapin within swap readahead windows
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

The non-synchronous swap fault path already computes either a VMA-based
or cluster-based readahead window. Use that existing window as locality
evidence for zswap-backed large swapin instead of mixing it with the
synchronous anon/shmem evidence.

The path first prepares the normal readahead window. If the faulting
aligned range is fully covered by that window and is still all-zswap, it
may be loaded as one large folio. If the large attempt fails or a backend
race is detected, the precomputed order-0 readahead window is used
without updating readahead state again.

Mixed zswap/disk ranges remain order-0 only. Disk-backed large swapin is
not added by this change.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memory.c     |   6 +-
 mm/swap.h       |   4 +-
 mm/swap_state.c | 434 +++++++++++++++++++++++++++++++++++++++---------
 mm/swapfile.c   |   2 +-
 4 files changed, 360 insertions(+), 86 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7bbb89632000..451375090d83 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5027,13 +5027,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
 	if (!folio) {
+		unsigned long swapin_orders = thp_swapin_suitable_orders(vmf);
+
 		/*
 		 * Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices.
 		 * The swap device is pinned while checking the flag, matching
 		 * the existing fault path.
 		 */
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
-			unsigned long swapin_orders = thp_swapin_suitable_orders(vmf);
 			unsigned long locality_orders =
 				swapin_anon_locality_orders(vmf, swapin_orders);
 
@@ -5041,7 +5042,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 					    swapin_orders | BIT(0),
 					    locality_orders, vmf, NULL, 0);
 		} else {
-			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
+			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+						 swapin_orders, vmf);
 		}
 
 		if (IS_ERR_OR_NULL(folio)) {
diff --git a/mm/swap.h b/mm/swap.h
index 5d1c81ab49b9..0e1bf9218b5e 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -323,7 +323,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 		struct mempolicy *mpol, pgoff_t ilx);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
-			struct vm_fault *vmf);
+			       unsigned long orders, struct vm_fault *vmf);
 struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
 			  unsigned long locality_orders, struct vm_fault *vmf,
 			  struct mempolicy *mpol, pgoff_t ilx);
@@ -413,7 +413,7 @@ static inline struct folio *swap_cluster_readahead(swp_entry_t entry,
 }
 
 static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
-			struct vm_fault *vmf)
+				unsigned long orders, struct vm_fault *vmf)
 {
 	return NULL;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 80dff6a1ee65..4f1eb0a7f9f5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -678,20 +678,24 @@ static bool swapin_zswap_admit(swp_entry_t entry,
 static unsigned long swapin_admit_orders(swp_entry_t entry,
 					 unsigned long orders,
 					 struct vm_fault *vmf,
-					 unsigned long locality_orders)
+					 unsigned long locality_orders,
+					 bool zswap_only)
 {
 	unsigned long candidates = orders & ~BIT(0);
-	unsigned long admitted = orders & BIT(0);
+	unsigned long admitted = zswap_only ? 0 : orders & BIT(0);
+	enum zswap_range_state fault_zswap_state = ZSWAP_RANGE_NEVER_ENABLED;
 	struct zswap_admit_ctx zswap_ctx = {};
+	bool fault_zswap_checked = false;
 	int order;
 
 	if (!candidates)
-		return orders;
+		return zswap_only ? 0 : orders;
 
 	while (candidates) {
 		enum zswap_range_state state;
 		unsigned int nr_pages;
 		swp_entry_t range_entry;
+		bool zswap_locality;
 		bool admit = false;
 
 		order = fls_long(candidates) - 1;
@@ -703,6 +707,29 @@ static unsigned long swapin_admit_orders(swp_entry_t entry,
 		nr_pages = 1U << order;
 		range_entry = swp_entry(swp_type(entry),
 					round_down(swp_offset(entry), nr_pages));
+		zswap_locality = order <= SWAPIN_ZSWAP_MAX_ORDER &&
+				 swapin_zswap_locality(vmf, order,
+						       locality_orders);
+		/*
+		 * If the faulting slot is already in zswap but this order has
+		 * no zswap locality evidence, a larger range covering the fault
+		 * cannot be admitted: it is either all-zswap or mixed, and both
+		 * require zswap locality. Avoid scanning the whole range on
+		 * sparse/random zswap refaults. If the faulting slot is not in
+		 * zswap, keep the full classification so all-disk large swapin
+		 * can follow the existing policy.
+		 */
+		if (!zswap_locality) {
+			if (zswap_only)
+				goto next;
+			if (!fault_zswap_checked) {
+				fault_zswap_state = zswap_probe_range(entry, 1);
+				fault_zswap_checked = true;
+			}
+			if (fault_zswap_state == ZSWAP_RANGE_ALL_ZSWAP)
+				goto next;
+		}
+
 		if (!swapin_zeromap_same(range_entry, nr_pages))
 			goto next;
 
@@ -718,7 +745,7 @@ static unsigned long swapin_admit_orders(swp_entry_t entry,
 			break;
 		case ZSWAP_RANGE_NEVER_ENABLED:
 		case ZSWAP_RANGE_NO_ZSWAP:
-			admit = true;
+			admit = !zswap_only;
 			break;
 		}
 
@@ -730,21 +757,32 @@ static unsigned long swapin_admit_orders(swp_entry_t entry,
 		candidates &= ~BIT(order);
 	}
 
-	return admitted ? admitted : BIT(0);
+	return admitted ? admitted : (zswap_only ? 0 : BIT(0));
 }
 
-static bool zswap_needs_order0_retry(struct folio *folio)
+static bool zswap_folio_all_zswap(struct folio *folio)
 {
+	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
+	       ZSWAP_RANGE_ALL_ZSWAP;
+}
+
+static bool zswap_needs_fallback(struct folio *folio, bool zswap_only)
+{
+	enum zswap_range_state state;
+
 	if (!folio_test_large(folio))
 		return false;
 
+	state = zswap_probe_range(folio->swap, folio_nr_pages(folio));
+	if (zswap_only)
+		return state != ZSWAP_RANGE_ALL_ZSWAP;
+
 	/*
 	 * Admission sees only an advisory zswap snapshot. Recheck after the
 	 * large swapcache folio is installed; if the range became mixed, drop
 	 * the fresh folio before IO and let order-0 handle each slot.
 	 */
-	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
-	       ZSWAP_RANGE_MIXED;
+	return state == ZSWAP_RANGE_MIXED;
 }
 
 /*
@@ -758,8 +796,7 @@ bool swapin_fault_only_young(struct folio *folio)
 	if (!folio_test_large(folio) || !folio_test_swapcache(folio))
 		return false;
 
-	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
-	       ZSWAP_RANGE_ALL_ZSWAP;
+	return zswap_folio_all_zswap(folio);
 }
 
 /*
@@ -893,34 +930,15 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 	return folio;
 }
 
-/**
- * swapin_sync - swap-in one or multiple entries skipping readahead.
- * @entry: swap entry indicating the target slot
- * @gfp: memory allocation flags
- * @orders: allocation orders
- * @locality_orders: orders with caller-provided locality evidence
- * @vmf: fault information
- * @mpol: NUMA memory allocation policy to be applied
- * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
- *
- * This allocates a folio suitable for given @orders, or returns the
- * existing folio in the swap cache for @entry. This initiates the IO, too,
- * if needed. @entry is rounded down if @orders allow large allocation.
- *
- * Context: Caller must ensure @entry is valid and pin the swap device with
- * refcount.
- * Return: Returns the folio on success, error code if failed.
- */
-struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp,
-			  unsigned long orders,
-			  unsigned long locality_orders,
-			  struct vm_fault *vmf, struct mempolicy *mpol,
-			  pgoff_t ilx)
+static struct folio *swapin_alloc_read(swp_entry_t entry, gfp_t gfp,
+				       unsigned long orders,
+				       struct vm_fault *vmf,
+				       struct mempolicy *mpol, pgoff_t ilx,
+				       bool retry_order0, bool zswap_only)
 {
 	struct folio *folio;
 	int ret;
 
-	orders = swapin_admit_orders(entry, orders, vmf, locality_orders);
 again:
 	do {
 		folio = swap_cache_get_folio(entry);
@@ -931,19 +949,21 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp,
 	} while (PTR_ERR(folio) == -EEXIST);
 
 	if (IS_ERR(folio))
-		return folio;
+		return retry_order0 ? folio : NULL;
 
-	if (zswap_needs_order0_retry(folio)) {
+	if (zswap_needs_fallback(folio, zswap_only)) {
 		count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN_FALLBACK);
 		/*
 		 * The folio is newly allocated, locked, clean and not uptodate;
 		 * no data has been read into it. Removing it only restores the
-		 * swap table entries so order-0 swapin can resolve a backend
+		 * swap table entries so the fallback path can resolve a backend
 		 * race without attempting speculative large-folio zswapin.
 		 */
 		swap_cache_del_folio(folio);
 		folio_unlock(folio);
 		folio_put(folio);
+		if (!retry_order0)
+			return NULL;
 		orders = BIT(0);
 		goto again;
 	}
@@ -954,12 +974,62 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp,
 		swap_cache_del_folio(folio);
 		folio_unlock(folio);
 		folio_put(folio);
+		if (!retry_order0)
+			return NULL;
 		orders = BIT(0);
 		goto again;
 	}
 	return folio;
 }
 
+/**
+ * swapin_sync - swap-in one or multiple entries skipping readahead.
+ * @entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: allocation orders
+ * @locality_orders: orders with caller-provided locality evidence
+ * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
+ *
+ * This allocates a folio suitable for given @orders, or returns the
+ * existing folio in the swap cache for @entry. This initiates the IO, too,
+ * if needed. @entry is rounded down if @orders allow large allocation.
+ *
+ * Context: Caller must ensure @entry is valid and pin the swap device with
+ * refcount.
+ * Return: Returns the folio on success, error code if failed.
+ */
+struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp,
+			  unsigned long orders,
+			  unsigned long locality_orders,
+			  struct vm_fault *vmf, struct mempolicy *mpol,
+			  pgoff_t ilx)
+{
+	orders = swapin_admit_orders(entry, orders, vmf,
+				     locality_orders, false);
+	return swapin_alloc_read(entry, gfp, orders, vmf, mpol, ilx,
+				 true, false);
+}
+
+static struct folio *swapin_zswap_large(swp_entry_t entry, gfp_t gfp,
+					unsigned long orders,
+					unsigned long locality_orders,
+					struct vm_fault *vmf,
+					struct mempolicy *mpol, pgoff_t ilx)
+{
+	if (READ_ONCE(page_cluster) <= 0)
+		return NULL;
+
+	orders = swapin_admit_orders(entry, orders, vmf,
+				     locality_orders, true);
+	if (!orders)
+		return NULL;
+
+	return swapin_alloc_read(entry, gfp, orders, vmf, mpol, ilx,
+				 false, true);
+}
+
 /*
  * Locate a page of swap in physical memory, reserving swap cache space
  * and reading the disk if it is not already cached.
@@ -1048,12 +1118,88 @@ static unsigned long swapin_nr_pages(unsigned long offset)
 	return pages;
 }
 
+struct swap_cluster_ra {
+	unsigned long start_offset;
+	unsigned long end_offset;
+	bool readahead;
+};
+
+static void swap_cluster_ra_prepare(swp_entry_t entry,
+				    struct swap_cluster_ra *ra)
+{
+	struct swap_info_struct *si = __swap_entry_to_info(entry);
+	unsigned long entry_offset = swp_offset(entry);
+	unsigned long mask;
+
+	mask = swapin_nr_pages(entry_offset) - 1;
+	ra->readahead = !!mask;
+	ra->start_offset = entry_offset;
+	ra->end_offset = entry_offset;
+	if (!mask)
+		return;
+
+	/* Read a page_cluster sized and aligned cluster around offset. */
+	ra->start_offset = entry_offset & ~mask;
+	ra->end_offset = entry_offset | mask;
+	if (!ra->start_offset)	/* First page is swap header. */
+		ra->start_offset++;
+	if (ra->end_offset >= si->max)
+		ra->end_offset = si->max - 1;
+}
+
+static unsigned long swap_cluster_ra_orders(swp_entry_t entry,
+					    unsigned long orders,
+					    const struct swap_cluster_ra *ra)
+{
+	unsigned long admitted = 0;
+	unsigned long candidates = orders & ~BIT(0);
+	unsigned long entry_offset = swp_offset(entry);
+	int order;
+
+	if (!ra->readahead)
+		return 0;
+
+	while (candidates) {
+		unsigned long nr_pages;
+		unsigned long start_offset;
+		unsigned long end_offset;
+
+		order = fls_long(candidates) - 1;
+		if (order > MAX_PAGE_ORDER) {
+			candidates &= ~BIT(order);
+			continue;
+		}
+
+		nr_pages = 1UL << order;
+		start_offset = round_down(entry_offset, nr_pages);
+		end_offset = start_offset + nr_pages - 1;
+		if (start_offset >= ra->start_offset &&
+		    end_offset <= ra->end_offset)
+			admitted |= BIT(order);
+		candidates &= ~BIT(order);
+	}
+
+	return admitted;
+}
+
+static bool swapin_readahead_skip(unsigned long index,
+				  unsigned long skip_start,
+				  unsigned long skip_end)
+{
+	return skip_start < skip_end &&
+	       index >= skip_start && index < skip_end;
+}
+
 /**
- * swap_cluster_readahead - swap in pages in hope we need them soon
+ * swap_cluster_readahead_win - swap in pages from a prepared swap window
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
  * @mpol: NUMA memory allocation policy to be applied
  * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
+ * @ra: readahead window prepared by swap_cluster_ra_prepare()
+ * @skip_start: first offset already covered by @target_folio
+ * @skip_end: offset after the already covered range
+ * @target_folio: target folio to return after queueing the rest of the window
  *
  * Returns the struct folio for entry and addr, after queueing swapin.
  *
@@ -1066,33 +1212,38 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * are used for every page of the readahead: neighbouring pages on swap
  * are fairly likely to have been swapped out from the same node.
  */
-struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				     struct mempolicy *mpol, pgoff_t ilx)
+static struct folio *swap_cluster_readahead_win(swp_entry_t entry,
+						gfp_t gfp_mask,
+						struct mempolicy *mpol,
+						pgoff_t ilx,
+						const struct swap_cluster_ra *ra,
+						unsigned long skip_start,
+						unsigned long skip_end,
+						struct folio *target_folio)
 {
 	struct folio *folio;
 	unsigned long entry_offset = swp_offset(entry);
-	unsigned long offset = entry_offset;
-	unsigned long start_offset, end_offset;
-	unsigned long mask;
-	struct swap_info_struct *si = __swap_entry_to_info(entry);
+	unsigned long offset;
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	swp_entry_t ra_entry;
 
-	mask = swapin_nr_pages(offset) - 1;
-	if (!mask)
+	if (!ra->readahead)
 		goto skip;
 
-	/* Read a page_cluster sized and aligned cluster around offset. */
-	start_offset = offset & ~mask;
-	end_offset = offset | mask;
-	if (!start_offset)	/* First page is swap header. */
-		start_offset++;
-	if (end_offset >= si->max)
-		end_offset = si->max - 1;
+	if (target_folio &&
+	    skip_start <= ra->start_offset && skip_end > ra->end_offset)
+		goto skip;
 
 	blk_start_plug(&plug);
-	for (offset = start_offset; offset <= end_offset ; offset++) {
+	for (offset = ra->start_offset; offset <= ra->end_offset; offset++) {
+		if (swapin_readahead_skip(offset, skip_start, skip_end)) {
+			if (skip_end > ra->end_offset)
+				break;
+			offset = skip_end - 1;
+			continue;
+		}
+
 		/* Ok, do the async read-ahead now */
 		ra_entry = swp_entry(swp_type(entry), offset);
 		folio = swap_cache_read_folio(ra_entry, gfp_mask, mpol, ilx,
@@ -1105,10 +1256,29 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	swap_read_unplug(splug);
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
+	if (target_folio)
+		return target_folio;
+
 	/* The page was likely read above, so no need for plugging here */
 	return swap_cache_read_folio(entry, gfp_mask, mpol, ilx, NULL, false);
 }
 
+struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
+				     struct mempolicy *mpol, pgoff_t ilx)
+{
+	struct swap_cluster_ra ra;
+
+	swap_cluster_ra_prepare(entry, &ra);
+	return swap_cluster_readahead_win(entry, gfp_mask, mpol, ilx, &ra,
+					 0, 0, NULL);
+}
+
+struct swap_vma_ra {
+	unsigned long start;
+	unsigned long end;
+	int win;
+};
+
 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
 			   unsigned long *end)
 {
@@ -1147,35 +1317,69 @@ static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
 	return win;
 }
 
-/**
- * swap_vma_readahead - swap in pages in hope we need them soon
- * @targ_entry: swap entry of the targeted memory
- * @gfp_mask: memory allocation flags
- * @mpol: NUMA memory allocation policy to be applied
- * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
- * @vmf: fault information
- *
- * Returns the struct folio for entry and addr, after queueing swapin.
- *
- * Primitive swap readahead code. We simply read in a few pages whose
- * virtual addresses are around the fault address in the same vma.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
- *
+static unsigned long swap_vma_ra_orders(struct vm_fault *vmf,
+					unsigned long orders,
+					const struct swap_vma_ra *ra)
+{
+	unsigned long admitted = 0;
+	unsigned long candidates = orders & ~BIT(0);
+	int order;
+
+	if (ra->win <= 1)
+		return 0;
+
+	while (candidates) {
+		unsigned long size;
+		unsigned long start;
+		unsigned long end;
+
+		order = fls_long(candidates) - 1;
+		if (order > MAX_PAGE_ORDER) {
+			candidates &= ~BIT(order);
+			continue;
+		}
+
+		size = PAGE_SIZE << order;
+		start = ALIGN_DOWN(vmf->address, size);
+		end = start + size;
+		if (start >= ra->start && end <= ra->end)
+			admitted |= BIT(order);
+		candidates &= ~BIT(order);
+	}
+
+	return admitted;
+}
+
+/*
+ * Queue swapin for a precomputed VMA readahead window. The window has already
+ * been accounted in vma->swap_readahead_info, so fallback after a failed
+ * zswap-large attempt does not update readahead state a second time. If
+ * @target_folio is already populated, queue only the part of the window outside
+ * [@skip_start, @skip_end) and return @target_folio.
  */
-static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
-		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
+static struct folio *swap_vma_readahead_win(swp_entry_t targ_entry,
+					    gfp_t gfp_mask,
+					    struct mempolicy *mpol,
+					    pgoff_t targ_ilx,
+					    struct vm_fault *vmf,
+					    const struct swap_vma_ra *ra,
+					    unsigned long skip_start,
+					    unsigned long skip_end,
+					    struct folio *target_folio)
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	struct folio *folio;
 	pte_t *pte = NULL, pentry;
-	int win;
 	unsigned long start, end, addr;
 	pgoff_t ilx = targ_ilx;
 
-	win = swap_vma_ra_win(vmf, &start, &end);
-	if (win == 1)
+	if (ra->win <= 1)
+		goto skip;
+
+	start = ra->start;
+	end = ra->end;
+	if (target_folio && skip_start <= start && skip_end >= end)
 		goto skip;
 
 	ilx = targ_ilx - PFN_DOWN(vmf->address - start);
@@ -1185,6 +1389,18 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		struct swap_info_struct *si = NULL;
 		softleaf_t entry;
 
+		if (swapin_readahead_skip(addr, skip_start, skip_end)) {
+			unsigned long next = min(skip_end, end);
+
+			if (pte) {
+				pte_unmap(pte);
+				pte = NULL;
+			}
+			ilx += PFN_DOWN(next - addr) - 1;
+			addr = next - PAGE_SIZE;
+			continue;
+		}
+
 		if (!pte++) {
 			pte = pte_offset_map(vmf->pmd, addr);
 			if (!pte)
@@ -1220,6 +1436,9 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 	swap_read_unplug(splug);
 	lru_add_drain();
 skip:
+	if (target_folio)
+		return target_folio;
+
 	/* The folio was likely read above, so no need for plugging here */
 	folio = swap_cache_read_folio(targ_entry, gfp_mask, mpol, targ_ilx,
 				      NULL, false);
@@ -1230,25 +1449,78 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
  * swapin_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
+ * @orders: large folio orders suitable for the faulting entry
  * @vmf: fault information
  *
  * Returns the struct folio for entry and addr, after queueing swapin.
  *
- * It's a main entry function for swap readahead. By the configuration,
- * it will read ahead blocks by cluster-based(ie, physical disk based)
- * or vma-based(ie, virtual address based on faulty address) readahead.
+ * This first computes the normal VMA or cluster readahead window. If the
+ * window fully covers an aligned all-zswap range containing the fault, that
+ * range may be swapped in as one large folio. The remaining window is still
+ * queued through the original order-0 readahead path, skipping the already
+ * covered target range and without updating readahead state a second time.
  */
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				struct vm_fault *vmf)
+				unsigned long orders, struct vm_fault *vmf)
 {
 	struct mempolicy *mpol;
 	pgoff_t ilx;
 	struct folio *folio;
+	unsigned long ra_orders;
+	bool vma_ra;
 
 	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
-	folio = swap_use_vma_readahead() ?
-		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
-		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+	vma_ra = swap_use_vma_readahead();
+	if (vma_ra) {
+		struct swap_vma_ra ra = {};
+		unsigned long skip_start = 0;
+		unsigned long skip_end = 0;
+
+		ra.win = swap_vma_ra_win(vmf, &ra.start, &ra.end);
+		ra_orders = swap_vma_ra_orders(vmf, orders, &ra);
+		if (ra_orders) {
+			folio = swapin_zswap_large(entry, gfp_mask, ra_orders,
+						   ra_orders, vmf, mpol, ilx);
+			if (folio) {
+				skip_start = ALIGN_DOWN(vmf->address,
+							folio_size(folio));
+				skip_end = skip_start + folio_size(folio);
+				folio = swap_vma_readahead_win(entry, gfp_mask,
+							       mpol, ilx, vmf,
+							       &ra, skip_start,
+							       skip_end, folio);
+				goto out;
+			}
+		}
+		folio = swap_vma_readahead_win(entry, gfp_mask, mpol, ilx,
+					       vmf, &ra, 0, 0, NULL);
+	} else {
+		struct swap_cluster_ra ra;
+		unsigned long skip_start = 0;
+		unsigned long skip_end = 0;
+
+		swap_cluster_ra_prepare(entry, &ra);
+		ra_orders = swap_cluster_ra_orders(entry, orders, &ra);
+		if (ra_orders) {
+			folio = swapin_zswap_large(entry, gfp_mask, ra_orders,
+						   ra_orders, vmf, mpol, ilx);
+			if (folio) {
+				skip_start = swp_offset(folio->swap);
+				skip_end = skip_start + folio_nr_pages(folio);
+				folio = swap_cluster_readahead_win(entry,
+								   gfp_mask,
+								   mpol, ilx,
+								   &ra,
+								   skip_start,
+								   skip_end,
+								   folio);
+				goto out;
+			}
+		}
+		folio = swap_cluster_readahead_win(entry, gfp_mask, mpol, ilx,
+						   &ra, 0, 0, NULL);
+	}
+out:
 	mpol_cond_put(mpol);
 
 	return folio;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 615d90867111..3b7e7d8ae89d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2452,7 +2452,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			};
 
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-						&vmf);
+						 0, &vmf);
 		}
 		if (!folio) {
 			swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 7/9] mm/shmem: provide VMA-hint locality for zswap large swapin
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

Let the shmem swap fault path pass locality evidence into the common
zswap large-swapin policy. Shmem does not have anon PTE-young density
evidence, so this first step only treats explicit VM_SEQ_READ as
positive evidence and VM_RAND_READ as a veto.

The non-fault shmem readahead path remains unchanged. This keeps large
zswap swapin limited to synchronous shmem faults where the caller
supplies a VMA and the common policy can still fall back to order-0.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/shmem.c | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index fa99b48ed62b..a5ac35ac85fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -30,6 +30,7 @@
 #include <linux/fileattr.h>
 #include <linux/filelock.h>
 #include <linux/mm.h>
+#include <linux/memcontrol.h>
 #include <linux/random.h>
 #include <linux/sched/signal.h>
 #include <linux/export.h>
@@ -1791,6 +1792,29 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
 	return folio;
 }
 
+static unsigned long shmem_swapin_locality_orders(struct vm_fault *vmf,
+						  unsigned long orders)
+{
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	unsigned long candidates = orders & ~BIT(0);
+
+	/*
+	 * Shmem does not have anon-style PTE young density evidence. Start with
+	 * explicit VMA access hints; future shmem/page-cache readahead evidence
+	 * can be folded into this producer without changing common swapin policy.
+	 */
+	if (!vma)
+		return 0;
+
+	if (vma->vm_flags & VM_RAND_READ)
+		return 0;
+
+	if (vma->vm_flags & VM_SEQ_READ)
+		return candidates;
+
+	return 0;
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 bool shmem_hpage_pmd_enabled(void)
 {
@@ -2020,18 +2044,22 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 		struct vm_fault *vmf, pgoff_t index,
 		swp_entry_t entry, int order, gfp_t gfp)
 {
+	unsigned long locality_orders;
+	unsigned long orders;
 	pgoff_t ilx;
 	struct folio *folio;
 	struct mempolicy *mpol;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
-	if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
-	     !zswap_never_enabled())
+	if (vmf && unlikely(userfaultfd_armed(vmf->vma)))
 		order = 0;
 
 again:
+	orders = BIT(order);
+	locality_orders = shmem_swapin_locality_orders(vmf, orders);
 	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
-	folio = swapin_sync(entry, gfp, BIT(order), 0, vmf, mpol, ilx);
+	folio = swapin_sync(entry, gfp, orders, locality_orders, vmf, mpol,
+			    ilx);
 	mpol_cond_put(mpol);
 
 	if (!IS_ERR(folio))
@@ -2339,7 +2367,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (!folio_matches_swap_entry(folio, swap) ||
 	    shmem_confirm_swap(mapping, index, swap) < 0) {
 		error = -EEXIST;
-		goto unlock;
+		goto failed_swapcache;
 	}
 	if (!folio_test_uptodate(folio)) {
 		error = -EIO;
@@ -2369,6 +2397,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
 
+	if (folio_test_large(folio))
+		memcg1_swapin(folio);
 	folio_put_swap(folio, NULL);
 	swap_cache_del_folio(folio);
 	folio_mark_dirty(folio);
@@ -2379,9 +2409,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 failed:
 	if (shmem_confirm_swap(mapping, index, swap) < 0)
 		error = -EEXIST;
+failed_swapcache:
+	if (folio && folio_test_large(folio) && folio_test_swapcache(folio))
+		memcg1_swapin(folio);
 	if (error == -EIO)
 		shmem_set_folio_swapin_error(inode, index, folio, swap);
-unlock:
 	if (folio)
 		folio_unlock(folio);
 failed_nolock:
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH v2 6/9] mm: provide anon locality evidence for zswap large swapin
From: fujunjie @ 2026-05-29 12:19 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, Alexandre Ghiti, Kairui Song, Usama Arif
  Cc: Chris Li, Johannes Weiner, Yosry Ahmed, Nhat Pham,
	David Hildenbrand, Hugh Dickins, Roman Gushchin, Shakeel Butt,
	linux-kernel, cgroups
In-Reply-To: <tencent_98CD9F78E48D08DC005A6471A13CFF28B60A@qq.com>

The common zswap large-swapin policy needs locality evidence from
callers before it can admit a large folio. For anonymous faults, provide
that evidence from existing VMA hints and from the PTE young state left
by earlier zswap-backed large swapins.

Keep non-faulting PTEs old when mapping a speculative all-zswap large
folio. A later fault can then require that the preceding range be densely
young before admitting another large swapin, without adding per-VMA state.

This also removes the old zswap-enabled guard from the THP swapin
candidate scan. The common swapin path now classifies the backend range
and falls back to order-0 for mixed zswap/disk ranges or races.

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memory.c     | 234 +++++++++++++++++++++++++++++++++++++++++++-----
 mm/swap.h       |   6 ++
 mm/swap_state.c |  15 ++++
 3 files changed, 235 insertions(+), 20 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 92a82008d583..7bbb89632000 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4556,6 +4556,35 @@ static void memcg1_swapin_retry_folio(struct folio *folio,
 	folio_unlock(folio);
 }
 
+static void set_swapin_ptes(struct vm_area_struct *vma,
+			    unsigned long address, pte_t *ptep, pte_t pte,
+			    unsigned int nr_pages, unsigned int fault_pte_idx,
+			    bool fault_only_young)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t old_pte;
+
+	if (!fault_only_young || nr_pages == 1) {
+		set_ptes(mm, address, ptep, pte, nr_pages);
+		return;
+	}
+
+	old_pte = pte_mkold(pte);
+	if (fault_pte_idx)
+		set_ptes(mm, address, ptep, old_pte, fault_pte_idx);
+
+	set_pte_at(mm, address + fault_pte_idx * PAGE_SIZE,
+		   ptep + fault_pte_idx,
+		   pte_mkyoung(pte_advance_pfn(pte, fault_pte_idx)));
+
+	fault_pte_idx++;
+	if (fault_pte_idx < nr_pages)
+		set_ptes(mm, address + fault_pte_idx * PAGE_SIZE,
+			 ptep + fault_pte_idx,
+			 pte_advance_pfn(old_pte, fault_pte_idx),
+			 nr_pages - fault_pte_idx);
+}
+
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 {
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
@@ -4628,6 +4657,157 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define SWAPIN_ANON_YOUNG_MIN_PERCENT		75
+#define SWAPIN_ANON_MAX_FAULT_SKIP_SHIFT	2
+
+static bool swapin_anon_prev_young_dense(struct vm_fault *vmf,
+					 unsigned int order)
+{
+	struct vm_area_struct *vma;
+	unsigned int nr_pages;
+	unsigned int threshold;
+	unsigned long size;
+	unsigned long base, prev, addr;
+	struct folio *first = NULL;
+	unsigned int present = 0;
+	unsigned int young = 0;
+	pmd_t *pmd;
+	pmd_t pmdval;
+	spinlock_t *ptl; /* protects the previous PTE range */
+	pte_t *ptep;
+	unsigned int i;
+
+	if (!IS_ENABLED(CONFIG_MMU) || !arch_has_hw_pte_young() || !vmf ||
+	    !vmf->vma || !vmf->pmd || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	nr_pages = 1U << order;
+	threshold = DIV_ROUND_UP(nr_pages *
+				 SWAPIN_ANON_YOUNG_MIN_PERCENT, 100);
+	size = PAGE_SIZE << order;
+
+	vma = vmf->vma;
+	base = ALIGN_DOWN(vmf->address, size);
+	if (base < size)
+		return false;
+
+	prev = base - size;
+	if (prev < vma->vm_start || prev + size > vma->vm_end)
+		return false;
+
+	pmd = vmf->pmd;
+	if ((prev & PMD_MASK) != (base & PMD_MASK)) {
+		pmd = mm_find_pmd(vma->vm_mm, prev);
+		if (!pmd)
+			return false;
+	}
+
+	pmdval = pmdp_get_lockless(pmd);
+	if (!pmd_present(pmdval) || pmd_leaf(pmdval))
+		return false;
+
+	ptep = pte_offset_map_lock(vma->vm_mm, pmd, prev, &ptl);
+	if (!ptep)
+		return false;
+
+	for (i = 0, addr = prev; i < nr_pages; i++, addr += PAGE_SIZE) {
+		struct folio *folio;
+		pte_t pte = ptep_get(ptep + i);
+
+		if (!pte_present(pte))
+			break;
+
+		folio = vm_normal_folio(vma, addr, pte);
+		if (!folio || folio_order(folio) != order)
+			break;
+		if (!first)
+			first = folio;
+		else if (folio != first)
+			break;
+
+		present++;
+		if (pte_young(pte))
+			young++;
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+	if (present != nr_pages)
+		return false;
+
+	return young >= threshold;
+}
+
+static bool swapin_anon_accessed_neighbour(struct vm_fault *vmf,
+					   unsigned int order)
+{
+	unsigned long size;
+	unsigned long base;
+	unsigned long fault_idx;
+	unsigned long max_skip;
+
+	if (!vmf || !vmf->vma || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	size = PAGE_SIZE << order;
+	base = ALIGN_DOWN(vmf->address, size);
+
+	/*
+	 * Without a sequential hint, require prior young-density evidence and
+	 * only allow faults near the start of the candidate range.
+	 */
+	fault_idx = (vmf->address - base) >> PAGE_SHIFT;
+	max_skip = (1UL << order) >> SWAPIN_ANON_MAX_FAULT_SKIP_SHIFT;
+	if (fault_idx > max_skip)
+		return false;
+
+	return swapin_anon_prev_young_dense(vmf, order);
+}
+
+static bool swapin_anon_fault_starts_range(struct vm_fault *vmf,
+					   unsigned int order)
+{
+	struct vm_area_struct *vma;
+	unsigned long size;
+	unsigned long base;
+	unsigned long first;
+
+	if (!vmf || !vmf->vma || !order || order > MAX_PAGE_ORDER)
+		return false;
+
+	vma = vmf->vma;
+	size = PAGE_SIZE << order;
+	base = ALIGN_DOWN(vmf->address, size);
+	first = ALIGN(vma->vm_start, size);
+
+	return base == first && vmf->address == base &&
+	       base + size <= vma->vm_end;
+}
+
+static unsigned long swapin_anon_locality_orders(struct vm_fault *vmf,
+						 unsigned long orders)
+{
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	unsigned long locality_orders = 0;
+	unsigned long candidates = orders & ~BIT(0);
+	int order;
+
+	if (vma && (vma->vm_flags & VM_RAND_READ))
+		return 0;
+
+	if (vma && (vma->vm_flags & VM_SEQ_READ))
+		return candidates;
+
+	while (candidates) {
+		order = fls_long(candidates) - 1;
+		if (swapin_anon_fault_starts_range(vmf, order) ||
+		    swapin_anon_accessed_neighbour(vmf, order))
+			locality_orders |= BIT(order);
+		candidates &= ~BIT(order);
+	}
+
+	return locality_orders;
+}
+
 /*
  * Check if the PTEs within a range are contiguous swap entries.
  */
@@ -4644,9 +4824,9 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
 		return false;
 	/*
-	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
+	 * swap_read_folio() can't do mixed-backend large folio IO. The common
+	 * synchronous swapin path will recheck backend state and fall back to
+	 * order-0 if a zswap/disk race makes the range mixed.
 	 */
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
@@ -4693,14 +4873,6 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		return 0;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		return 0;
-
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4708,10 +4880,13 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 	 */
 	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
 					  BIT(PMD_ORDER) - 1);
+	if (!orders)
+		return 0;
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+	if (!orders)
+		return 0;
 	orders = thp_swap_suitable_orders(swp_offset(entry),
 					  vmf->address, orders);
-
 	if (!orders)
 		return 0;
 
@@ -4741,6 +4916,12 @@ static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 {
 	return 0;
 }
+
+static unsigned long swapin_anon_locality_orders(struct vm_fault *vmf,
+						 unsigned long orders)
+{
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /* Sanity check that a folio is fully exclusive */
@@ -4777,6 +4958,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	unsigned long page_idx;
 	unsigned long address;
 	pte_t *ptep;
+	bool fault_only_young = false;
 
 	if (!pte_unmap_same(vmf))
 		goto out;
@@ -4845,13 +5027,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
 	if (!folio) {
-		/* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+		/*
+		 * Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices.
+		 * The swap device is pinned while checking the flag, matching
+		 * the existing fault path.
+		 */
+		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+			unsigned long swapin_orders = thp_swapin_suitable_orders(vmf);
+			unsigned long locality_orders =
+				swapin_anon_locality_orders(vmf, swapin_orders);
+
 			folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
-					    thp_swapin_suitable_orders(vmf) | BIT(0),
-					    0, vmf, NULL, 0);
-		else
+					    swapin_orders | BIT(0),
+					    locality_orders, vmf, NULL, 0);
+		} else {
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
+		}
 
 		if (IS_ERR_OR_NULL(folio)) {
 			/*
@@ -5110,9 +5301,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 	VM_BUG_ON(!folio_test_anon(folio) ||
 			(pte_write(pte) && !PageAnonExclusive(page)));
-	set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
-	arch_do_swap_page_nr(vma->vm_mm, vma, address,
-			pte, pte, nr_pages);
+	if (folio == swapcache && nr_pages == folio_nr_pages(folio) &&
+	    arch_has_hw_pte_young())
+		fault_only_young = swapin_fault_only_young(folio);
+	set_swapin_ptes(vma, address, ptep, pte, nr_pages, page_idx,
+			fault_only_young);
+	arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages);
 
 	/*
 	 * Remove the swap entry and conditionally try to free up the swapcache.
diff --git a/mm/swap.h b/mm/swap.h
index dd35a310d06d..5d1c81ab49b9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -327,6 +327,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
 struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
 			  unsigned long locality_orders, struct vm_fault *vmf,
 			  struct mempolicy *mpol, pgoff_t ilx);
+bool swapin_fault_only_young(struct folio *folio);
 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 			   unsigned long addr);
 
@@ -430,6 +431,11 @@ static inline void swap_update_readahead(struct folio *folio,
 {
 }
 
+static inline bool swapin_fault_only_young(struct folio *folio)
+{
+	return false;
+}
+
 static inline int swap_writeout(struct folio *folio,
 		struct swap_iocb **swap_plug)
 {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5a4ca289009a..80dff6a1ee65 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -747,6 +747,21 @@ static bool zswap_needs_order0_retry(struct folio *folio)
 	       ZSWAP_RANGE_MIXED;
 }
 
+/*
+ * A speculative large swapin may install PTEs for pages that did not fault.
+ * Keep those non-faulting PTEs old so a later anon fault can report
+ * PTE-young density as caller-provided locality evidence without storing
+ * state in the VMA.
+ */
+bool swapin_fault_only_young(struct folio *folio)
+{
+	if (!folio_test_large(folio) || !folio_test_swapcache(folio))
+		return false;
+
+	return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
+	       ZSWAP_RANGE_ALL_ZSWAP;
+}
+
 /*
  * If we are the only user, then try to free up the swap cache.
  *
-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox