From: Barry Song <21cnbao@gmail.com>
To: wangkefeng.wang@huawei.com
Cc: akpm@linux-foundation.org, baolin.wang@linux.alibaba.com,
	chrisl@kernel.org, david@redhat.com, hanchuanhua@oppo.com,
	hannes@cmpxchg.org, hch@infradead.org, hughd@google.com,
	kaleshsingh@google.com, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, mhocko@suse.com, minchan@kernel.org,
	nphamcs@gmail.com, ryan.roberts@arm.com, ryncsn@gmail.com,
	senozhatsky@chromium.org, shakeel.butt@linux.dev,
	shy828301@gmail.com, surenb@google.com, v-songbaohua@oppo.com,
	willy@infradead.org, xiang@kernel.org, ying.huang@intel.com,
	yosryahmed@google.com
Subject: Re: [PATCH v6 2/2] mm: support large folios swap-in for zRAM-like devices
Date: Fri, 16 Aug 2024 11:06:12 +1200
Message-ID: <20240815230612.77266-1-21cnbao@gmail.com>
In-Reply-To: <20ed69ad-5dad-446b-9f01-86ad8b1c67fa@huawei.com>

On Fri, Aug 16, 2024 at 1:27 AM Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
>
>
>
> On 2024/8/15 17:47, Kairui Song wrote:
> > On Fri, Aug 2, 2024 at 8:21 PM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > Hi Chuanhua,
> >
> >>
> ...
>
> >> +
> >> +static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >> +{
> >> +       struct vm_area_struct *vma = vmf->vma;
> >> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >> +       unsigned long orders;
> >> +       struct folio *folio;
> >> +       unsigned long addr;
> >> +       swp_entry_t entry;
> >> +       spinlock_t *ptl;
> >> +       pte_t *pte;
> >> +       gfp_t gfp;
> >> +       int order;
> >> +
> >> +       /*
> >> +        * If uffd is active for the vma we need per-page fault fidelity to
> >> +        * maintain the uffd semantics.
> >> +        */
> >> +       if (unlikely(userfaultfd_armed(vma)))
> >> +               goto fallback;
> >> +
> >> +       /*
> >> +        * A large swapped out folio could be partially or fully in zswap. We
> >> +        * lack handling for such cases, so fallback to swapping in order-0
> >> +        * folio.
> >> +        */
> >> +       if (!zswap_never_enabled())
> >> +               goto fallback;
> >> +
> >> +       entry = pte_to_swp_entry(vmf->orig_pte);
> >> +       /*
> >> +        * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >> +        * and suitable for swapping THP.
> >> +        */
> >> +       orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> >> +                       TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
> >> +       orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> >> +       orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders);
> >> +
> >> +       if (!orders)
> >> +               goto fallback;
> >> +
> >> +       pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl);
> >> +       if (unlikely(!pte))
> >> +               goto fallback;
> >> +
> >> +       /*
> >> +        * For do_swap_page, find the highest order where the aligned range is
> >> +        * completely swap entries with contiguous swap offsets.
> >> +        */
> >> +       order = highest_order(orders);
> >> +       while (orders) {
> >> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >> +               if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
> >> +                       break;
> >> +               order = next_order(&orders, order);
> >> +       }
> >> +
> >> +       pte_unmap_unlock(pte, ptl);
> >> +
> >> +       /* Try allocating the highest of the remaining orders. */
> >> +       gfp = vma_thp_gfp_mask(vma);
> >> +       while (orders) {
> >> +               addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >> +               folio = vma_alloc_folio(gfp, order, vma, addr, true);
> >> +               if (folio)
> >> +                       return folio;
> >> +               order = next_order(&orders, order);
> >> +       }
> >> +
> >> +fallback:
> >> +#endif
> >> +       return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
> >> +}
> >> +
> >> +
> >>   /*
> >>    * We enter with non-exclusive mmap_lock (to exclude vma changes,
> >>    * but allow concurrent faults), and pte mapped but not yet locked.
> >> @@ -4074,35 +4220,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >>          if (!folio) {
> >>                  if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> >>                      __swap_count(entry) == 1) {
> >> -                       /*
> >> -                        * Prevent parallel swapin from proceeding with
> >> -                        * the cache flag. Otherwise, another thread may
> >> -                        * finish swapin first, free the entry, and swapout
> >> -                        * reusing the same entry. It's undetectable as
> >> -                        * pte_same() returns true due to entry reuse.
> >> -                        */
> >> -                       if (swapcache_prepare(entry, 1)) {
> >> -                               /* Relax a bit to prevent rapid repeated page faults */
> >> -                               schedule_timeout_uninterruptible(1);
> >> -                               goto out;
> >> -                       }
> >> -                       need_clear_cache = true;
> >> -
> >>                          /* skip swapcache */
> >> -                       folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
> >> -                                               vma, vmf->address, false);
> >> +                       folio = alloc_swap_folio(vmf);
> >>                          page = &folio->page;
> >>                          if (folio) {
> >>                                  __folio_set_locked(folio);
> >>                                  __folio_set_swapbacked(folio);
> >>
> >> +                               nr_pages = folio_nr_pages(folio);
> >> +                               if (folio_test_large(folio))
> >> +                                       entry.val = ALIGN_DOWN(entry.val, nr_pages);
> >> +                               /*
> >> +                                * Prevent parallel swapin from proceeding with
> >> +                                * the cache flag. Otherwise, another thread may
> >> +                                * finish swapin first, free the entry, and swapout
> >> +                                * reusing the same entry. It's undetectable as
> >> +                                * pte_same() returns true due to entry reuse.
> >> +                                */
> >> +                               if (swapcache_prepare(entry, nr_pages)) {
> >> +                                       /* Relax a bit to prevent rapid repeated page faults */
> >> +                                       schedule_timeout_uninterruptible(1);
> >> +                                       goto out_page;
> >> +                               }
> >> +                               need_clear_cache = true;
> >> +
> >>                                  if (mem_cgroup_swapin_charge_folio(folio,
> >>                                                          vma->vm_mm, GFP_KERNEL,
> >>                                                          entry)) {
> >>                                          ret = VM_FAULT_OOM;
> >>                                          goto out_page;
> >>                                  }
> >
> > After your patch, with a kernel build test, I'm seeing kernel log
> > spam like this:
> > [  101.048594] pagefault_out_of_memory: 95 callbacks suppressed
> > [  101.048599] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.059416] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.118575] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.125585] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.182501] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.215351] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.272822] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > [  101.403195] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > ............
> >
> > And heavy performance loss with memcg-limited workloads when mTHP is enabled.
> >
> > After some debugging, the problematic part is the
> > mem_cgroup_swapin_charge_folio call above.
> > When under pressure, cgroup charge fails easily for mTHP. One 64k
> > swapin will require much more aggressive reclaim to succeed.
> >
> > If I change MAX_RECLAIM_RETRIES from 16 to 512, the log spam is gone
> > and mTHP swapin should have a much higher success rate.
> > But this might not be the right way.
> >
> > For this particular issue, maybe you can change the charge order: try
> > charging first; if it succeeds, use mTHP, and if it fails, fall back to 4k?
>
> This is what we did in alloc_anon_folio(), see 085ff35e7636
> ("mm: memory: move mem_cgroup_charge() into alloc_anon_folio()"),
> 1) fall back earlier
> 2) use the same GFP flags for allocation and charge
>
> but it seems a little more complicated for the swapin charge

Kefeng, thanks! I guess we can continue using the same approach and
it's not too complicated. 

Kairui, sorry for the trouble and thanks for the report! Could you
check whether the solution below resolves the issue? On phones, we don't
encounter the scenarios you’re facing.

From 2daaf91077705a8fa26a3a428117f158f05375b0 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Fri, 16 Aug 2024 10:51:48 +1200
Subject: [PATCH] mm: fall back to next_order if charging mTHP fails

When memcg approaches its limit, charging mTHP becomes difficult.
At this point, when the charge fails, we fall back to the next order
to avoid repeatedly retrying larger orders.

Reported-by: Kairui Song <ryncsn@gmail.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/memory.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 0ed3603aaf31..6cba28ef91e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4121,8 +4121,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
-		if (folio)
-			return folio;
+		if (folio) {
+			if (!mem_cgroup_swapin_charge_folio(folio,
+					vma->vm_mm, gfp, entry))
+				return folio;
+			folio_put(folio);
+		}
 		order = next_order(&orders, order);
 	}
 
@@ -4244,7 +4248,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 				}
 				need_clear_cache = true;
 
-				if (mem_cgroup_swapin_charge_folio(folio,
+				if (nr_pages == 1 && mem_cgroup_swapin_charge_folio(folio,
 							vma->vm_mm, GFP_KERNEL,
 							entry)) {
 					ret = VM_FAULT_OOM;
-- 
2.34.1
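
For reference, with the above change applied, the allocation path in
alloc_swap_folio() behaves roughly like the sketch below. This is a
simplified illustration only, not the exact kernel code (hence the
_sketch name): the uffd/zswap checks and the pte scan that narrow down
the candidate orders are omitted, and 'orders' is assumed to already
hold the candidate large orders.

/*
 * Simplified sketch: charge the memcg inside the allocation loop, so a
 * failed charge drops us to the next smaller order instead of bubbling
 * up as VM_FAULT_OOM.
 */
static struct folio *alloc_swap_folio_sketch(struct vm_fault *vmf,
					     unsigned long orders)
{
	struct vm_area_struct *vma = vmf->vma;
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
	gfp_t gfp = vma_thp_gfp_mask(vma);
	int order = highest_order(orders);
	struct folio *folio;
	unsigned long addr;

	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			/* Allocation succeeded; try to charge at this order. */
			if (!mem_cgroup_swapin_charge_folio(folio,
					vma->vm_mm, gfp, entry))
				return folio;
			/* Charge failed: free and retry a smaller order. */
			folio_put(folio);
		}
		order = next_order(&orders, order);
	}

	/*
	 * Order-0 fallback; it is charged later in do_swap_page(), which
	 * is why the charge there is now guarded by nr_pages == 1.
	 */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
}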


Thanks
Barry



