From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Alex Thorlton <athorlton@sgi.com>, Ingo Molnar <mingo@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>,
Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>,
"Paul E . McKenney" <paulmck@linux.vnet.ibm.com>,
Al Viro <viro@zeniv.linux.org.uk>,
Andi Kleen <ak@linux.intel.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Dave Hansen <dave.hansen@intel.com>,
Dave Jones <davej@redhat.com>,
David Howells <dhowells@redhat.com>,
Frederic Weisbecker <fweisbec@gmail.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Kees Cook <keescook@chromium.org>, Mel Gorman <mgorman@suse.de>,
Michael Kerrisk <mtk.manpages@gmail.com>,
Oleg Nesterov <oleg@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Rik van Riel <riel@redhat.com>, Robin Holt <robinmholt@gmail.com>,
Sedat Dilek <sedat.dilek@gmail.com>,
Srikar Dronamraju <srikar@linux.vnet.ibm.com>,
Thomas Gleixner <tglx@linutronix.de>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv2 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock
Date: Mon, 16 Sep 2013 14:25:35 +0300
Message-ID: <1379330740-5602-5-git-send-email-kirill.shutemov@linux.intel.com>
In-Reply-To: <1379330740-5602-1-git-send-email-kirill.shutemov@linux.intel.com>

With split page table lock it's important to know which lock
pmd_trans_huge_lock() took. This patch adds one more parameter to the
function so it returns the taken lock.

In most places migration to the new API is trivial. The exception is
move_huge_pmd(): there we need to take two locks if the PMD tables are
different.
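
For reference, the caller-side pattern after this change looks roughly
like the sketch below (illustration only, not part of the patch; the
surrounding page-walker context and error handling are elided):

	spinlock_t *ptl;

	/* On success the function returns 1 and leaves *ptl held. */
	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		/* ... operate on the stable huge pmd ... */
		spin_unlock(ptl);	/* release whichever lock was taken */
		return 0;
	}
	/* 0: not a huge pmd; -1: pmd was splitting, split already waited for. */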
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
fs/proc/task_mmu.c | 13 +++++++------
include/linux/huge_mm.h | 14 +++++++-------
mm/huge_memory.c | 40 +++++++++++++++++++++++++++-------------
mm/memcontrol.c | 10 +++++-----
4 files changed, 46 insertions(+), 31 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8e124ac..4599519 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -993,13 +993,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
struct vm_area_struct *vma;
struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1017,7 +1018,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}
@@ -1319,7 +1320,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
md = walk->private;
- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
@@ -1327,7 +1328,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3935428..4aca0d8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma);
+extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl);
/* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
if (pmd_trans_huge(*pmd))
- return __pmd_trans_huge_lock(pmd, vma);
+ return __pmd_trans_huge_lock(pmd, vma, ptl);
else
return 0;
}
@@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bbd41a2..59a1340 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1342,9 +1342,10 @@ out_unlock:
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1359,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
atomic_dec(&tlb->mm->nr_ptes);
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1368,7 +1369,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
atomic_dec(&tlb->mm->nr_ptes);
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1381,14 +1382,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec)
{
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1401,6 +1403,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
+ spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
@@ -1421,12 +1424,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
if (ret == 1) {
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
}
out:
return ret;
@@ -1436,9 +1448,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
@@ -1454,7 +1467,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
}
set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
ret = 1;
}
@@ -1468,12 +1481,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1482,7 +1496,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce..5f35b2a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6376,10 +6376,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6568,9 +6568,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6587,7 +6587,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
--
1.8.4.rc3