From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linux-mm@kvack.org, akpm@linux-foundation.org
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>,
Linus Torvalds <torvalds@linux-foundation.org>,
npiggin@gmail.com, kaleshsingh@google.com,
joel@joelfernandes.org,
"Kirill A . Shutemov" <kirill@shutemov.name>,
linuxppc-dev@lists.ozlabs.org
Subject: [RFC PATCH 1/2] mm/mremap: Fix race between MOVE_PMD mremap and pageout
Date: Tue, 25 May 2021 14:03:43 +0530 [thread overview]
Message-ID: <20210525083344.162377-1-aneesh.kumar@linux.ibm.com> (raw)
CPU 1 CPU 2 CPU 3
mremap(old_addr, new_addr) page_shrinker/try_to_unmap_one
mmap_write_lock_killable()
addr = old_addr
lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)
*new_pmd = pmd
*new_addr = 10; and fills
TLB with new addr
and old pfn
unlock(pmd_ptl)
ptep_clear_flush()
old pfn is free.
Stale TLB entry
Fix this race by holding pmd lock in pageout. This still doesn't handle the race
between MOVE_PUD and pageout.
Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
include/linux/rmap.h | 9 ++++++---
mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
2 files changed, 24 insertions(+), 21 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..272ab0c2b60b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
unsigned long address;
pmd_t *pmd;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *pte_ptl;
+ spinlock_t *pmd_ptl;
unsigned int flags;
};
@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
if (pvmw->pte && !PageHuge(pvmw->page))
pte_unmap(pvmw->pte);
- if (pvmw->ptl)
- spin_unlock(pvmw->ptl);
+ if (pvmw->pte_ptl)
+ spin_unlock(pvmw->pte_ptl);
+ if (pvmw->pmd_ptl)
+ spin_unlock(pvmw->pmd_ptl);
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..87a2c94c7e27 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return false;
}
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS) {
+ pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
+ }
return true;
}
@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
- spin_lock(pvmw->ptl);
+ pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ spin_lock(pvmw->pte_ptl);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pud_present(*pud))
return false;
pvmw->pmd = pmd_offset(pud, pvmw->address);
+ pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
/*
* Make sure the pmd value isn't cached in a register by the
* compiler and used as a stale value after we've observed a
@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
*/
pmde = READ_ONCE(*pvmw->pmd);
if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
if (likely(pmd_trans_huge(*pvmw->pmd))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
}
}
return not_found(pvmw);
- } else {
- /* THP pmd was split under us: handle on pte level */
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
+ } else if (!pmd_present(pmde))
+ return not_found(pvmw);
+
if (!map_pte(pvmw))
goto next_pte;
while (1) {
@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
pte_unmap(pvmw->pte);
- if (pvmw->ptl) {
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
+ if (pvmw->pte_ptl) {
+ spin_unlock(pvmw->pte_ptl);
+ pvmw->pte_ptl = NULL;
}
+ spin_unlock(pvmw->pmd_ptl);
+ pvmw->pmd_ptl = NULL;
goto restart;
} else {
pvmw->pte++;
}
} while (pte_none(*pvmw->pte));
- if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
+ pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
}
}
}
--
2.31.1
WARNING: multiple messages have this Message-ID (diff)
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
To: linux-mm@kvack.org, akpm@linux-foundation.org
Cc: mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
kaleshsingh@google.com, npiggin@gmail.com,
joel@joelfernandes.org,
Christophe Leroy <christophe.leroy@csgroup.eu>,
Linus Torvalds <torvalds@linux-foundation.org>,
"Kirill A . Shutemov" <kirill@shutemov.name>,
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Subject: [RFC PATCH 1/2] mm/mremap: Fix race between MOVE_PMD mremap and pageout
Date: Tue, 25 May 2021 14:03:43 +0530 [thread overview]
Message-ID: <20210525083344.162377-1-aneesh.kumar@linux.ibm.com> (raw)
CPU 1 CPU 2 CPU 3
mremap(old_addr, new_addr) page_shrinker/try_to_unmap_one
mmap_write_lock_killable()
addr = old_addr
lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)
*new_pmd = pmd
*new_addr = 10; and fills
TLB with new addr
and old pfn
unlock(pmd_ptl)
ptep_clear_flush()
old pfn is free.
Stale TLB entry
Fix this race by holding pmd lock in pageout. This still doesn't handle the race
between MOVE_PUD and pageout.
Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
include/linux/rmap.h | 9 ++++++---
mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
2 files changed, 24 insertions(+), 21 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..272ab0c2b60b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
unsigned long address;
pmd_t *pmd;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *pte_ptl;
+ spinlock_t *pmd_ptl;
unsigned int flags;
};
@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
if (pvmw->pte && !PageHuge(pvmw->page))
pte_unmap(pvmw->pte);
- if (pvmw->ptl)
- spin_unlock(pvmw->ptl);
+ if (pvmw->pte_ptl)
+ spin_unlock(pvmw->pte_ptl);
+ if (pvmw->pmd_ptl)
+ spin_unlock(pvmw->pmd_ptl);
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..87a2c94c7e27 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return false;
}
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS) {
+ pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
+ }
return true;
}
@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
- spin_lock(pvmw->ptl);
+ pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ spin_lock(pvmw->pte_ptl);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pud_present(*pud))
return false;
pvmw->pmd = pmd_offset(pud, pvmw->address);
+ pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
/*
* Make sure the pmd value isn't cached in a register by the
* compiler and used as a stale value after we've observed a
@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
*/
pmde = READ_ONCE(*pvmw->pmd);
if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
if (likely(pmd_trans_huge(*pvmw->pmd))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
}
}
return not_found(pvmw);
- } else {
- /* THP pmd was split under us: handle on pte level */
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
+ } else if (!pmd_present(pmde))
+ return not_found(pvmw);
+
if (!map_pte(pvmw))
goto next_pte;
while (1) {
@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
pte_unmap(pvmw->pte);
- if (pvmw->ptl) {
- spin_unlock(pvmw->ptl);
- pvmw->ptl = NULL;
+ if (pvmw->pte_ptl) {
+ spin_unlock(pvmw->pte_ptl);
+ pvmw->pte_ptl = NULL;
}
+ spin_unlock(pvmw->pmd_ptl);
+ pvmw->pmd_ptl = NULL;
goto restart;
} else {
pvmw->pte++;
}
} while (pte_none(*pvmw->pte));
- if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
+ if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
+ pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->pte_ptl);
}
}
}
--
2.31.1
next reply other threads:[~2021-05-25 8:35 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-25 8:33 Aneesh Kumar K.V [this message]
2021-05-25 8:33 ` [RFC PATCH 1/2] mm/mremap: Fix race between MOVE_PMD mremap and pageout Aneesh Kumar K.V
2021-05-25 8:33 ` [RFC PATCH 2/2] mm/mremap: Fix race between MOVE_PUD " Aneesh Kumar K.V
2021-05-25 8:33 ` Aneesh Kumar K.V
2021-05-25 17:28 ` Linus Torvalds
2021-05-25 17:28 ` Linus Torvalds
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210525083344.162377-1-aneesh.kumar@linux.ibm.com \
--to=aneesh.kumar@linux.ibm.com \
--cc=akpm@linux-foundation.org \
--cc=joel@joelfernandes.org \
--cc=kaleshsingh@google.com \
--cc=kirill@shutemov.name \
--cc=linux-mm@kvack.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=npiggin@gmail.com \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.