From: Minchan Kim <minchan@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm <linux-mm@kvack.org>,
LKML <linux-kernel@vger.kernel.org>,
linux-api@vger.kernel.org, Michal Hocko <mhocko@suse.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Tim Murray <timmurray@google.com>,
Joel Fernandes <joel@joelfernandes.org>,
Suren Baghdasaryan <surenb@google.com>,
Daniel Colascione <dancol@google.com>,
Shakeel Butt <shakeelb@google.com>,
Sonny Rao <sonnyrao@google.com>,
Brian Geffon <bgeffon@google.com>,
jannh@google.com, oleg@redhat.com, christian@brauner.io,
oleksandr@redhat.com, hdanton@sina.com, lizeb@google.com,
Minchan Kim <minchan@kernel.org>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Christopher Lameter <cl@linux.com>
Subject: [PATCH v2 5/5] mm: factor out pmd young/dirty bit handling and THP split
Date: Mon, 10 Jun 2019 20:12:52 +0900 [thread overview]
Message-ID: <20190610111252.239156-6-minchan@kernel.org> (raw)
In-Reply-To: <20190610111252.239156-1-minchan@kernel.org>
MADV_COLD, MADV_PAGEOUT and MADV_FREE now share a common part: each of
them either resets the access/dirty bits of a PMD or splits the THP page
so that only some of its subpages are handled. This patch factors out
that common part.
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Christopher Lameter <cl@linux.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
include/linux/huge_mm.h | 3 -
mm/huge_memory.c | 74 -------------
mm/madvise.c | 234 +++++++++++++++++++++++-----------------
3 files changed, 135 insertions(+), 176 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..2667e1aa3ce5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -29,9 +29,6 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
-extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long addr, unsigned long next);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f8bce9a6b32..22e20f929463 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1668,80 +1668,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
return 0;
}
-/*
- * Return true if we do MADV_FREE successfully on entire pmd page.
- * Otherwise, return false.
- */
-bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long addr, unsigned long next)
-{
- spinlock_t *ptl;
- pmd_t orig_pmd;
- struct page *page;
- struct mm_struct *mm = tlb->mm;
- bool ret = false;
-
- tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
-
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (!ptl)
- goto out_unlocked;
-
- orig_pmd = *pmd;
- if (is_huge_zero_pmd(orig_pmd))
- goto out;
-
- if (unlikely(!pmd_present(orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- goto out;
- }
-
- page = pmd_page(orig_pmd);
- /*
- * If other processes are mapping this page, we couldn't discard
- * the page unless they all do MADV_FREE so let's skip the page.
- */
- if (page_mapcount(page) != 1)
- goto out;
-
- if (!trylock_page(page))
- goto out;
-
- /*
- * If user want to discard part-pages of THP, split it so MADV_FREE
- * will deactivate only them.
- */
- if (next - addr != HPAGE_PMD_SIZE) {
- get_page(page);
- spin_unlock(ptl);
- split_huge_page(page);
- unlock_page(page);
- put_page(page);
- goto out_unlocked;
- }
-
- if (PageDirty(page))
- ClearPageDirty(page);
- unlock_page(page);
-
- if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- pmdp_invalidate(vma, addr, pmd);
- orig_pmd = pmd_mkold(orig_pmd);
- orig_pmd = pmd_mkclean(orig_pmd);
-
- set_pmd_at(mm, addr, pmd, orig_pmd);
- tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- }
-
- mark_page_lazyfree(page);
- ret = true;
-out:
- spin_unlock(ptl);
-out_unlocked:
- return ret;
-}
-
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
pgtable_t pgtable;
diff --git a/mm/madvise.c b/mm/madvise.c
index 3b9d2ba421b1..bb1906bb75fd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -310,6 +310,91 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+enum madv_pmdp_reset_t {
+ MADV_PMDP_RESET, /* pmd was reset successfully */
+ MADV_PMDP_SPLIT, /* pmd was split */
+ MADV_PMDP_ERROR,
+};
+
+static enum madv_pmdp_reset_t madvise_pmdp_reset_or_split(struct mm_walk *walk,
+ pmd_t *pmd, spinlock_t *ptl,
+ unsigned long addr, unsigned long end,
+ bool young, bool dirty)
+{
+ pmd_t orig_pmd;
+ unsigned long next;
+ struct page *page;
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = walk->mm;
+ struct vm_area_struct *vma = walk->vma;
+ bool reset_young = false;
+ bool reset_dirty = false;
+ enum madv_pmdp_reset_t ret = MADV_PMDP_ERROR;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ return ret;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ return ret;
+ }
+
+ next = pmd_addr_end(addr, end);
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ /*
+ * THP collapsing is not cheap so only split the page if it is
+ * private to this process.
+ */
+ if (page_mapcount(page) != 1)
+ return ret;
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ if (!split_huge_page(page))
+ ret = MADV_PMDP_SPLIT;
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+
+ if (young && pmd_young(orig_pmd))
+ reset_young = true;
+ if (dirty && pmd_dirty(orig_pmd))
+ reset_dirty = true;
+
+ /*
+ * Other processes could rely on PG_dirty for data consistency,
+ * not pte_dirty, so we can reset PG_dirty only when we are the owner
+ * of the page.
+ */
+ if (reset_dirty) {
+ if (page_mapcount(page) != 1)
+ goto out;
+ if (!trylock_page(page))
+ goto out;
+ if (PageDirty(page))
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ ret = MADV_PMDP_RESET;
+ if (reset_young || reset_dirty) {
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmd);
+ if (reset_young)
+ orig_pmd = pmd_mkold(orig_pmd);
+ if (reset_dirty)
+ orig_pmd = pmd_mkclean(orig_pmd);
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+out:
+ return ret;
+}
+
static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -319,64 +404,31 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
struct page *page;
- unsigned long next;
- next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- pmd_t orig_pmd;
-
- tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
- orig_pmd = *pmd;
- if (is_huge_zero_pmd(orig_pmd))
- goto huge_unlock;
-
- if (unlikely(!pmd_present(orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- goto huge_unlock;
- }
-
- page = pmd_page(orig_pmd);
- if (next - addr != HPAGE_PMD_SIZE) {
- int err;
-
- if (page_mapcount(page) != 1)
- goto huge_unlock;
-
- get_page(page);
+ switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+ true, false)) {
+ case MADV_PMDP_RESET:
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (!err)
- goto regular_page;
- return 0;
- }
-
- if (pmd_young(orig_pmd)) {
- pmdp_invalidate(vma, addr, pmd);
- orig_pmd = pmd_mkold(orig_pmd);
-
- set_pmd_at(mm, addr, pmd, orig_pmd);
- tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ page = pmd_page(*pmd);
+ test_and_clear_page_young(page);
+ deactivate_page(page);
+ goto next;
+ case MADV_PMDP_ERROR:
+ spin_unlock(ptl);
+ goto next;
+ case MADV_PMDP_SPLIT:
+ ; /* go through */
}
-
- test_and_clear_page_young(page);
- deactivate_page(page);
-huge_unlock:
- spin_unlock(ptl);
- return 0;
}
if (pmd_trans_unstable(pmd))
return 0;
-regular_page:
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
@@ -414,6 +466,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
arch_enter_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
+next:
cond_resched();
return 0;
@@ -464,70 +517,38 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
LIST_HEAD(page_list);
struct page *page;
int isolated = 0;
- unsigned long next;
if (fatal_signal_pending(current))
return -EINTR;
- next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
- pmd_t orig_pmd;
-
- tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
- orig_pmd = *pmd;
- if (is_huge_zero_pmd(orig_pmd))
- goto huge_unlock;
-
- if (unlikely(!pmd_present(orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- goto huge_unlock;
- }
-
- page = pmd_page(orig_pmd);
- if (next - addr != HPAGE_PMD_SIZE) {
- int err;
-
- if (page_mapcount(page) != 1)
- goto huge_unlock;
- get_page(page);
+ switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+ true, false)) {
+ case MADV_PMDP_RESET:
+ page = pmd_page(*pmd);
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (!err)
- goto regular_page;
- return 0;
- }
-
- if (isolate_lru_page(page))
- goto huge_unlock;
-
- if (pmd_young(orig_pmd)) {
- pmdp_invalidate(vma, addr, pmd);
- orig_pmd = pmd_mkold(orig_pmd);
-
- set_pmd_at(mm, addr, pmd, orig_pmd);
- tlb_remove_tlb_entry(tlb, pmd, addr);
+ if (isolate_lru_page(page))
+ return 0;
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ list_add(&page->lru, &page_list);
+ reclaim_pages(&page_list);
+ goto next;
+ case MADV_PMDP_ERROR:
+ spin_unlock(ptl);
+ goto next;
+ case MADV_PMDP_SPLIT:
+ ; /* go through */
}
-
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
- list_add(&page->lru, &page_list);
-huge_unlock:
- spin_unlock(ptl);
- reclaim_pages(&page_list);
- return 0;
}
if (pmd_trans_unstable(pmd))
return 0;
-regular_page:
+
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
@@ -569,6 +590,7 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
reclaim_pages(&page_list);
+next:
cond_resched();
return 0;
@@ -620,12 +642,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte, *pte, ptent;
struct page *page;
int nr_swap = 0;
- unsigned long next;
- next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd))
- if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
+ true, true)) {
+ case MADV_PMDP_RESET:
+ page = pmd_page(*pmd);
+ spin_unlock(ptl);
+ mark_page_lazyfree(page);
goto next;
+ case MADV_PMDP_ERROR:
+ spin_unlock(ptl);
+ goto next;
+ case MADV_PMDP_SPLIT:
+ ; /* go through */
+ }
+ }
if (pmd_trans_unstable(pmd))
return 0;
@@ -737,8 +773,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
- cond_resched();
next:
+ cond_resched();
return 0;
}
--
2.22.0.rc2.383.gf4fbbf30c2-goog
next prev parent reply other threads:[~2019-06-10 11:12 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-06-10 11:12 [PATCH v2 0/5] Introduce MADV_COLD and MADV_PAGEOUT Minchan Kim
2019-06-10 11:12 ` [PATCH v2 1/5] mm: introduce MADV_COLD Minchan Kim
2019-06-19 12:56 ` Michal Hocko
2019-06-20 0:06 ` Minchan Kim
2019-06-20 7:08 ` Michal Hocko
2019-06-20 8:44 ` Minchan Kim
2019-06-10 11:12 ` [PATCH v2 2/5] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM Minchan Kim
2019-06-19 13:09 ` Michal Hocko
2019-06-10 11:12 ` [PATCH v2 3/5] mm: account nr_isolated_xxx in [isolate|putback]_lru_page Minchan Kim
2019-06-10 11:12 ` [PATCH v2 4/5] mm: introduce MADV_PAGEOUT Minchan Kim
2019-06-19 13:24 ` Michal Hocko
2019-06-20 4:16 ` Minchan Kim
2019-06-20 7:04 ` Michal Hocko
2019-06-20 8:40 ` Minchan Kim
2019-06-20 9:22 ` Michal Hocko
2019-06-20 10:32 ` Minchan Kim
2019-06-20 10:55 ` Michal Hocko
2019-06-10 11:12 ` Minchan Kim [this message]
2019-06-10 18:03 ` [PATCH v2 0/5] Introduce MADV_COLD and MADV_PAGEOUT Dave Hansen
2019-06-13 4:51 ` Minchan Kim
2019-06-12 10:59 ` Pavel Machek
2019-06-12 11:19 ` Oleksandr Natalenko
2019-06-12 11:37 ` Pavel Machek
2019-06-19 12:27 ` Michal Hocko
2019-06-19 23:42 ` Minchan Kim
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190610111252.239156-6-minchan@kernel.org \
--to=minchan@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=bgeffon@google.com \
--cc=christian@brauner.io \
--cc=cl@linux.com \
--cc=dancol@google.com \
--cc=hannes@cmpxchg.org \
--cc=hdanton@sina.com \
--cc=jannh@google.com \
--cc=joel@joelfernandes.org \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lizeb@google.com \
--cc=mhocko@suse.com \
--cc=oleg@redhat.com \
--cc=oleksandr@redhat.com \
--cc=shakeelb@google.com \
--cc=sonnyrao@google.com \
--cc=surenb@google.com \
--cc=timmurray@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.