* [PATCH 2/2] thp: support split page table lock
2013-08-30 17:18 [PATCH 0/2 v2] split page table lock for hugepage Naoya Horiguchi
@ 2013-08-30 17:18 ` Naoya Horiguchi
2013-09-02 10:53 ` Kirill A. Shutemov
2013-09-03 20:52 ` Naoya Horiguchi
0 siblings, 2 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-08-30 17:18 UTC (permalink / raw)
To: linux-mm
Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, Alex Thorlton, linux-kernel
Thp related code also uses per process mm->page_table_lock now. So making
it fine-grained can provide better performance.
This patch makes thp support split page table lock which makes us use
page->ptl of the pages storing "pmd_trans_huge" pmds.
Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
are expected to pass the ptl pointer back to their callers, so this
patch adds new arguments to those functions for that purpose. Apart from
that, this patch is only a straightforward replacement.
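For example, a converted caller is expected to follow roughly this pattern
(an illustrative sketch of the new calling convention, not a hunk taken from
the diff below):

	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		/* *pmd is a stable huge pmd here, protected by *ptl */
		/* ... operate on the huge pmd ... */
		spin_unlock(ptl);
	}

On any other return value the lock is not held, so the caller falls through
to the normal pte path as before.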
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
---
arch/powerpc/mm/pgtable_64.c | 8 +-
arch/s390/mm/pgtable.c | 4 +-
arch/sparc/mm/tlb.c | 4 +-
fs/proc/task_mmu.c | 17 +++--
include/linux/huge_mm.h | 11 +--
include/linux/mm.h | 3 +
mm/huge_memory.c | 170 ++++++++++++++++++++++++++-----------------
mm/memcontrol.c | 14 ++--
mm/memory.c | 15 ++--
mm/migrate.c | 8 +-
mm/mprotect.c | 5 +-
mm/pgtable-generic.c | 10 +--
mm/rmap.c | 11 ++-
13 files changed, 161 insertions(+), 119 deletions(-)
diff --git v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c v3.11-rc3/arch/powerpc/mm/pgtable_64.c
index 536eec72..f9177eb 100644
--- v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c
+++ v3.11-rc3/arch/powerpc/mm/pgtable_64.c
@@ -605,7 +605,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(vma->vm_mm, pmdp));
#endif
#ifdef PTE_ATOMIC_UPDATES
@@ -643,7 +643,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/*
* we store the pgtable in the second half of PMD
*/
@@ -663,7 +663,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
@@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_none(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
WARN_ON(!pmd_trans_huge(pmd));
#endif
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
diff --git v3.11-rc3.orig/arch/s390/mm/pgtable.c v3.11-rc3/arch/s390/mm/pgtable.c
index a8154a1..d6c6b5c 100644
--- v3.11-rc3.orig/arch/s390/mm/pgtable.c
+++ v3.11-rc3/arch/s390/mm/pgtable.c
@@ -1170,7 +1170,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
if (!mm->pmd_huge_pte)
@@ -1186,7 +1186,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pte_t *ptep;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/arch/sparc/mm/tlb.c v3.11-rc3/arch/sparc/mm/tlb.c
index 7a91f28..cca3bed 100644
--- v3.11-rc3.orig/arch/sparc/mm/tlb.c
+++ v3.11-rc3/arch/sparc/mm/tlb.c
@@ -193,7 +193,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
if (!mm->pmd_huge_pte)
@@ -208,7 +208,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
struct list_head *lh;
pgtable_t pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/fs/proc/task_mmu.c v3.11-rc3/fs/proc/task_mmu.c
index dbf61f6..e23c882 100644
--- v3.11-rc3.orig/fs/proc/task_mmu.c
+++ v3.11-rc3/fs/proc/task_mmu.c
@@ -503,11 +503,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -980,10 +980,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ spinlock_t *uninitialized_var(ptl);
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
@@ -997,7 +998,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}
@@ -1276,13 +1277,13 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
pte_t *orig_pte;
pte_t *pte;
md = walk->private;
- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
@@ -1290,7 +1291,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
diff --git v3.11-rc3.orig/include/linux/huge_mm.h v3.11-rc3/include/linux/huge_mm.h
index b60de92..1faf757 100644
--- v3.11-rc3.orig/include/linux/huge_mm.h
+++ v3.11-rc3/include/linux/huge_mm.h
@@ -54,7 +54,8 @@ enum page_check_address_pmd_flag {
extern pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag);
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -133,14 +134,14 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long end,
long adjust_next);
extern int __pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma);
+ struct vm_area_struct *vma, spinlock_t **ptl);
/* mmap_sem must be held on entry */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
if (pmd_trans_huge(*pmd))
- return __pmd_trans_huge_lock(pmd, vma);
+ return __pmd_trans_huge_lock(pmd, vma, ptl);
else
return 0;
}
@@ -219,7 +220,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
{
}
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
return 0;
}
diff --git v3.11-rc3.orig/include/linux/mm.h v3.11-rc3/include/linux/mm.h
index f022460..9219f43 100644
--- v3.11-rc3.orig/include/linux/mm.h
+++ v3.11-rc3/include/linux/mm.h
@@ -1251,6 +1251,8 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
} while (0)
#define pte_lock_deinit(page) ((page)->mapping = NULL)
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define huge_pmd_lockptr(mm, pmdp) \
+ ({(void)(mm); __pte_lockptr(virt_to_page(pmdp)); })
#else /* !USE_SPLIT_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
@@ -1258,6 +1260,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#define pte_lock_init(page) do {} while (0)
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
+#define huge_pmd_lockptr(mm, pmdp) ({(void)(pmd); &(mm)->page_table_lock; })
#endif /* USE_SPLIT_PTLOCKS */
static inline void pgtable_page_ctor(struct page *page)
diff --git v3.11-rc3.orig/mm/huge_memory.c v3.11-rc3/mm/huge_memory.c
index 243e710..3cb29e1 100644
--- v3.11-rc3.orig/mm/huge_memory.c
+++ v3.11-rc3/mm/huge_memory.c
@@ -705,6 +705,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
+ spinlock_t *ptl;
VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -719,9 +720,10 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
@@ -733,7 +735,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
return 0;
@@ -761,6 +763,7 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif
+/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -804,10 +807,11 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
@@ -864,14 +868,17 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t pmd;
pgtable_t pgtable;
int ret;
+ spinlock_t *uninitialized_var(dst_ptl), *uninitialized_var(src_ptl);
ret = -ENOMEM;
pgtable = pte_alloc_one(dst_mm, addr);
if (unlikely(!pgtable))
goto out;
- spin_lock(&dst_mm->page_table_lock);
- spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = huge_pmd_lockptr(dst_mm, dst_ptl);
+ src_ptl = huge_pmd_lockptr(src_mm, src_ptl);
+ spin_lock(dst_ptl);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pmd = *src_pmd;
@@ -880,7 +887,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * When page table lock is held, the huge zero pmd should not be
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -901,8 +908,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
pte_free(dst_mm, pgtable);
wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -922,8 +929,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = 0;
out_unlock:
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
out:
return ret;
}
@@ -936,8 +943,9 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
{
pmd_t entry;
unsigned long haddr;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
@@ -947,7 +955,7 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);
unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
@@ -960,6 +968,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
int i, ret = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
@@ -980,7 +989,8 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;
@@ -1007,7 +1017,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);
@@ -1017,7 +1027,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1037,6 +1047,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
@@ -1077,7 +1088,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1103,7 +1115,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1114,7 +1126,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
return ret;
out_free_pages:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1134,12 +1146,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
@@ -1155,7 +1168,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1200,11 +1213,11 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
@@ -1225,13 +1238,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return ret;
}
@@ -1240,11 +1253,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd_t *pmd,
unsigned int flags)
{
- struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
- assert_spin_locked(&mm->page_table_lock);
-
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
@@ -1295,8 +1305,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
int target_nid;
int current_nid = -1;
bool migrated;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1314,17 +1325,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Acquire the page lock to serialise THP migrations */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
lock_page(page);
/* Confirm the PTE did not while locked */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1336,7 +1347,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
check_same:
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
clear_pmdnuma:
@@ -1345,7 +1356,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (current_nid != -1)
task_numa_fault(current_nid, HPAGE_PMD_NR, false);
return 0;
@@ -1355,8 +1366,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1371,7 +1383,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1380,7 +1392,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1394,13 +1406,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned char *vec)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1415,6 +1428,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
{
int ret = 0;
pmd_t pmd;
+ spinlock_t *uninitialized_var(ptl);
struct mm_struct *mm = vma->vm_mm;
@@ -1433,12 +1447,12 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &ptl);
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
out:
return ret;
@@ -1449,8 +1463,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
@@ -1466,7 +1481,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
}
set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
ret = 1;
}
@@ -1480,12 +1495,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = huge_pmd_lockptr(vma->vm_mm, pmd);
+ spin_lock(*ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1494,14 +1511,23 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}
+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it's true, this function returns *pmd with holding the page table lock
+ * and passing it back to the caller via @ptl.
+ * If it's false, returns NULL without holding the page table lock.
+ */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag)
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
{
pmd_t *pmd, *ret = NULL;
@@ -1511,10 +1537,12 @@ pmd_t *page_check_address_pmd(struct page *page,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
+ *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(*ptl);
if (pmd_none(*pmd))
- goto out;
+ goto unlock;
if (pmd_page(*pmd) != page)
- goto out;
+ goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1524,12 +1552,15 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto out;
+ goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
ret = pmd;
+ goto out;
}
+unlock:
+ spin_unlock(*ptl);
out:
return ret;
}
@@ -1541,14 +1572,15 @@ static int __split_huge_page_splitting(struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
/* For mmu_notifiers */
const unsigned long mmun_start = address;
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1559,8 +1591,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
@@ -1694,10 +1726,10 @@ static int __split_huge_page_map(struct page *page,
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
+ spinlock_t *uninitialized_var(ptl);
- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1752,8 +1784,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -2314,7 +2346,7 @@ static void collapse_huge_page(struct mm_struct *mm,
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ spin_lock(ptl); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2322,7 +2354,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
spin_lock(ptl);
@@ -2331,7 +2363,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2339,7 +2371,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2364,13 +2396,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*hpage = NULL;
@@ -2698,6 +2730,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
+ spinlock_t *ptl;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2706,22 +2739,23 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
diff --git v3.11-rc3.orig/mm/memcontrol.c v3.11-rc3/mm/memcontrol.c
index 00a7a66..3949444 100644
--- v3.11-rc3.orig/mm/memcontrol.c
+++ v3.11-rc3/mm/memcontrol.c
@@ -6591,12 +6591,12 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6769,7 +6769,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
enum mc_target_type target_type;
union mc_target target;
struct page *page;
@@ -6785,9 +6785,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6804,7 +6804,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
diff --git v3.11-rc3.orig/mm/memory.c v3.11-rc3/mm/memory.c
index 7ec1252..6827a35 100644
--- v3.11-rc3.orig/mm/memory.c
+++ v3.11-rc3/mm/memory.c
@@ -1531,20 +1531,21 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* fall through */
}
split_fallthrough:
@@ -3609,17 +3610,17 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte, *orig_pte;
unsigned long _addr = addr & PMD_MASK;
unsigned long offset;
- spinlock_t *ptl;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);
bool numa = false;
int local_nid = numa_node_id();
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
pmd = *pmdp;
if (pmd_numa(pmd)) {
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
numa = true;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!numa)
return 0;
diff --git v3.11-rc3.orig/mm/migrate.c v3.11-rc3/mm/migrate.c
index c69a9c7..1e1e9f2 100644
--- v3.11-rc3.orig/mm/migrate.c
+++ v3.11-rc3/mm/migrate.c
@@ -1659,6 +1659,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
+ spinlock_t *ptl;
/*
* Don't migrate pages that are mapped in multiple processes.
@@ -1699,9 +1700,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, entry))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1746,7 +1748,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
unlock_page(new_page);
unlock_page(page);
diff --git v3.11-rc3.orig/mm/mprotect.c v3.11-rc3/mm/mprotect.c
index 94722a4..c65c390 100644
--- v3.11-rc3.orig/mm/mprotect.c
+++ v3.11-rc3/mm/mprotect.c
@@ -116,9 +116,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
diff --git v3.11-rc3.orig/mm/pgtable-generic.c v3.11-rc3/mm/pgtable-generic.c
index e1a6e4f..8e49928 100644
--- v3.11-rc3.orig/mm/pgtable-generic.c
+++ v3.11-rc3/mm/pgtable-generic.c
@@ -124,11 +124,10 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* The caller must hold page table lock */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
@@ -141,13 +140,14 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* no "address" argument so destroys page coloring of some arch */
+/*
+ * no "address" argument so destroys page coloring of some arch
+ * The caller must hold page table lock.
+ */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;
- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
diff --git v3.11-rc3.orig/mm/rmap.c v3.11-rc3/mm/rmap.c
index eccec58..798f6ae 100644
--- v3.11-rc3.orig/mm/rmap.c
+++ v3.11-rc3/mm/rmap.c
@@ -666,24 +666,24 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
int referenced = 0;
+ spinlock_t *uninitialized_var(ptl);
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
- spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
goto out;
}
if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
goto out;
@@ -692,10 +692,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
} else {
pte_t *pte;
- spinlock_t *ptl;
/*
* rmap might return false positives; we must filter
--
1.8.3.1
* RE: [PATCH 2/2] thp: support split page table lock
2013-08-30 17:18 ` [PATCH 2/2] thp: support split page table lock Naoya Horiguchi
@ 2013-09-02 10:53 ` Kirill A. Shutemov
2013-09-02 16:37 ` Naoya Horiguchi
2013-09-03 20:52 ` Naoya Horiguchi
1 sibling, 1 reply; 14+ messages in thread
From: Kirill A. Shutemov @ 2013-09-02 10:53 UTC (permalink / raw)
To: Naoya Horiguchi
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, Alex Thorlton, linux-kernel
Naoya Horiguchi wrote:
> Thp related code also uses per process mm->page_table_lock now. So making
> it fine-grained can provide better performance.
>
> This patch makes thp support split page table lock which makes us use
> page->ptl of the pages storing "pmd_trans_huge" pmds.
Hm. So, you use page->ptl only when you deal with thp pages, otherwise
mm->page_table_lock, right?
It looks inconsistent to me. Does it mean we have to take both locks on
split and collapse paths? I'm not sure if it's safe to take only
page->ptl for alloc path. Probably not.
Why not use the new locking for pmd everywhere?
--
Kirill A. Shutemov
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-02 10:53 ` Kirill A. Shutemov
@ 2013-09-02 16:37 ` Naoya Horiguchi
0 siblings, 0 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-02 16:37 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, Aneesh Kumar K.V,
Alex Thorlton, linux-kernel
Kirill, thank you for the comment.
On Mon, Sep 02, 2013 at 01:53:27PM +0300, Kirill A. Shutemov wrote:
> Naoya Horiguchi wrote:
> > Thp related code also uses per process mm->page_table_lock now. So making
> > it fine-grained can provide better performance.
> >
> > This patch makes thp support split page table lock which makes us use
> > page->ptl of the pages storing "pmd_trans_huge" pmds.
>
> Hm. So, you use page->ptl only when you deal with thp pages, otherwise
> mm->page_table_lock, right?
Maybe it's not enough.
We use page->ptl for both thp and normal page tables, depending on
USE_SPLIT_PTLOCKS. And regardless of USE_SPLIT_PTLOCKS, mm->page_table_lock
is still used in other contexts, such as memory initialization code or
driver code, for their own specific purposes.
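To put it concretely, the lock pointer used for a huge pmd resolves like
this (just a restatement of the two huge_pmd_lockptr() variants that the
patch adds to include/linux/mm.h):

	/* USE_SPLIT_PTLOCKS: the lock lives in the page storing the pmd */
	ptl = __pte_lockptr(virt_to_page(pmdp));

	/* !USE_SPLIT_PTLOCKS: fall back to the single per-mm lock */
	ptl = &mm->page_table_lock;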
> It looks inconsistent to me. Does it mean we have to take both locks on
> split and collapse paths?
This patch includes the replacement with page->ptl for the split/collapse paths.
> I'm not sure if it's safe to take only
> page->ptl for alloc path. Probably not.
Right, it's not safe.
> Why not use the new locking for pmd everywhere?
So I already do this.
Thanks,
Naoya Horiguchi
* Re: [PATCH 2/2] thp: support split page table lock
2013-08-30 17:18 ` [PATCH 2/2] thp: support split page table lock Naoya Horiguchi
2013-09-02 10:53 ` Kirill A. Shutemov
@ 2013-09-03 20:52 ` Naoya Horiguchi
1 sibling, 0 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-03 20:52 UTC (permalink / raw)
To: linux-mm
Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, Alex Thorlton, linux-kernel
> diff --git v3.11-rc3.orig/mm/huge_memory.c v3.11-rc3/mm/huge_memory.c
> index 243e710..3cb29e1 100644
> --- v3.11-rc3.orig/mm/huge_memory.c
> +++ v3.11-rc3/mm/huge_memory.c
...
> @@ -864,14 +868,17 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> pmd_t pmd;
> pgtable_t pgtable;
> int ret;
> + spinlock_t *uninitialized_var(dst_ptl), *uninitialized_var(src_ptl);
>
> ret = -ENOMEM;
> pgtable = pte_alloc_one(dst_mm, addr);
> if (unlikely(!pgtable))
> goto out;
>
> - spin_lock(&dst_mm->page_table_lock);
> - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
> + dst_ptl = huge_pmd_lockptr(dst_mm, dst_ptl);
> + src_ptl = huge_pmd_lockptr(src_mm, src_ptl);
I found one mistake. This should be:
+ dst_ptl = huge_pmd_lockptr(dst_mm, dst_pmd);
+ src_ptl = huge_pmd_lockptr(src_mm, src_pmd);
Thanks,
Naoya Horiguchi
> + spin_lock(dst_ptl);
> + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>
> ret = -EAGAIN;
> pmd = *src_pmd;
* [PATCH 2/2] thp: support split page table lock
2013-09-05 21:27 [PATCH 0/2 v3] split page table lock for hugepage Naoya Horiguchi
@ 2013-09-05 21:27 ` Naoya Horiguchi
2013-09-06 10:48 ` Kirill A. Shutemov
2013-09-06 16:04 ` Alex Thorlton
0 siblings, 2 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-05 21:27 UTC (permalink / raw)
To: linux-mm
Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, Alex Thorlton, linux-kernel
Thp related code also uses per process mm->page_table_lock now.
So making it fine-grained can provide better performance.
This patch makes thp support split page table lock by using page->ptl
of the pages storing "pmd_trans_huge" pmds.
Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
are expected to pass the ptl pointer back to their callers, so this
patch adds new arguments to those functions for that purpose. Apart from
that, this patch is only a straightforward replacement.
ChangeLog v3:
- fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
- added missing declaration of ptl in do_huge_pmd_anonymous_page()
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
---
arch/powerpc/mm/pgtable_64.c | 8 +-
arch/s390/mm/pgtable.c | 4 +-
arch/sparc/mm/tlb.c | 4 +-
fs/proc/task_mmu.c | 17 +++--
include/linux/huge_mm.h | 11 +--
include/linux/mm.h | 3 +
mm/huge_memory.c | 171 ++++++++++++++++++++++++++-----------------
mm/memcontrol.c | 14 ++--
mm/memory.c | 15 ++--
mm/migrate.c | 8 +-
mm/mprotect.c | 5 +-
mm/pgtable-generic.c | 10 +--
mm/rmap.c | 11 ++-
13 files changed, 162 insertions(+), 119 deletions(-)
diff --git v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c v3.11-rc3/arch/powerpc/mm/pgtable_64.c
index 536eec72..f9177eb 100644
--- v3.11-rc3.orig/arch/powerpc/mm/pgtable_64.c
+++ v3.11-rc3/arch/powerpc/mm/pgtable_64.c
@@ -605,7 +605,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(vma->vm_mm, pmdp));
#endif
#ifdef PTE_ATOMIC_UPDATES
@@ -643,7 +643,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/*
* we store the pgtable in the second half of PMD
*/
@@ -663,7 +663,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
@@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_none(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
WARN_ON(!pmd_trans_huge(pmd));
#endif
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
diff --git v3.11-rc3.orig/arch/s390/mm/pgtable.c v3.11-rc3/arch/s390/mm/pgtable.c
index a8154a1..d6c6b5c 100644
--- v3.11-rc3.orig/arch/s390/mm/pgtable.c
+++ v3.11-rc3/arch/s390/mm/pgtable.c
@@ -1170,7 +1170,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
if (!mm->pmd_huge_pte)
@@ -1186,7 +1186,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pte_t *ptep;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/arch/sparc/mm/tlb.c v3.11-rc3/arch/sparc/mm/tlb.c
index 7a91f28..cca3bed 100644
--- v3.11-rc3.orig/arch/sparc/mm/tlb.c
+++ v3.11-rc3/arch/sparc/mm/tlb.c
@@ -193,7 +193,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
{
struct list_head *lh = (struct list_head *) pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
if (!mm->pmd_huge_pte)
@@ -208,7 +208,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
struct list_head *lh;
pgtable_t pgtable;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(huge_pmd_lockptr(mm, pmdp));
/* FIFO */
pgtable = mm->pmd_huge_pte;
diff --git v3.11-rc3.orig/fs/proc/task_mmu.c v3.11-rc3/fs/proc/task_mmu.c
index dbf61f6..e23c882 100644
--- v3.11-rc3.orig/fs/proc/task_mmu.c
+++ v3.11-rc3/fs/proc/task_mmu.c
@@ -503,11 +503,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -980,10 +980,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ spinlock_t *uninitialized_var(ptl);
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
@@ -997,7 +998,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}
@@ -1276,13 +1277,13 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
pte_t *orig_pte;
pte_t *pte;
md = walk->private;
- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
@@ -1290,7 +1291,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
diff --git v3.11-rc3.orig/include/linux/huge_mm.h v3.11-rc3/include/linux/huge_mm.h
index b60de92..1faf757 100644
--- v3.11-rc3.orig/include/linux/huge_mm.h
+++ v3.11-rc3/include/linux/huge_mm.h
@@ -54,7 +54,8 @@ enum page_check_address_pmd_flag {
extern pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag);
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -133,14 +134,14 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long end,
long adjust_next);
extern int __pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma);
+ struct vm_area_struct *vma, spinlock_t **ptl);
/* mmap_sem must be held on entry */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
if (pmd_trans_huge(*pmd))
- return __pmd_trans_huge_lock(pmd, vma);
+ return __pmd_trans_huge_lock(pmd, vma, ptl);
else
return 0;
}
@@ -219,7 +220,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
{
}
static inline int pmd_trans_huge_lock(pmd_t *pmd,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, spinlock_t **ptl)
{
return 0;
}
diff --git v3.11-rc3.orig/include/linux/mm.h v3.11-rc3/include/linux/mm.h
index f022460..9219f43 100644
--- v3.11-rc3.orig/include/linux/mm.h
+++ v3.11-rc3/include/linux/mm.h
@@ -1251,6 +1251,8 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
} while (0)
#define pte_lock_deinit(page) ((page)->mapping = NULL)
#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define huge_pmd_lockptr(mm, pmdp) \
+ ({(void)(mm); __pte_lockptr(virt_to_page(pmdp)); })
#else /* !USE_SPLIT_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
@@ -1258,6 +1260,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#define pte_lock_init(page) do {} while (0)
#define pte_lock_deinit(page) do {} while (0)
#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
+#define huge_pmd_lockptr(mm, pmdp) ({(void)(pmd); &(mm)->page_table_lock; })
#endif /* USE_SPLIT_PTLOCKS */
static inline void pgtable_page_ctor(struct page *page)
diff --git v3.11-rc3.orig/mm/huge_memory.c v3.11-rc3/mm/huge_memory.c
index 243e710..20fd1dd 100644
--- v3.11-rc3.orig/mm/huge_memory.c
+++ v3.11-rc3/mm/huge_memory.c
@@ -705,6 +705,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
+ spinlock_t *ptl;
VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -719,9 +720,10 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
@@ -733,7 +735,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
return 0;
@@ -761,6 +763,7 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif
+/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -795,6 +798,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ spinlock_t *ptl;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -804,10 +808,11 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
@@ -864,14 +869,17 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t pmd;
pgtable_t pgtable;
int ret;
+ spinlock_t *uninitialized_var(dst_ptl), *uninitialized_var(src_ptl);
ret = -ENOMEM;
pgtable = pte_alloc_one(dst_mm, addr);
if (unlikely(!pgtable))
goto out;
- spin_lock(&dst_mm->page_table_lock);
- spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+ dst_ptl = huge_pmd_lockptr(dst_mm, dst_pmd);
+ src_ptl = huge_pmd_lockptr(src_mm, src_pmd);
+ spin_lock(dst_ptl);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pmd = *src_pmd;
@@ -880,7 +888,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * When page table lock is held, the huge zero pmd should not be
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -901,8 +909,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
pte_free(dst_mm, pgtable);
wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -922,8 +930,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = 0;
out_unlock:
- spin_unlock(&src_mm->page_table_lock);
- spin_unlock(&dst_mm->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
out:
return ret;
}
@@ -936,8 +944,9 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
{
pmd_t entry;
unsigned long haddr;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
@@ -947,7 +956,7 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);
unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
@@ -960,6 +969,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
int i, ret = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
@@ -980,7 +990,8 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;
@@ -1007,7 +1018,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);
@@ -1017,7 +1028,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1037,6 +1048,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl;
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
@@ -1077,7 +1089,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1103,7 +1116,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1114,7 +1127,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
return ret;
out_free_pages:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1134,12 +1147,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
@@ -1155,7 +1169,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1200,11 +1214,11 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
@@ -1225,13 +1239,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return ret;
}
@@ -1240,11 +1254,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd_t *pmd,
unsigned int flags)
{
- struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
- assert_spin_locked(&mm->page_table_lock);
-
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
@@ -1295,8 +1306,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
int target_nid;
int current_nid = -1;
bool migrated;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1314,17 +1326,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Acquire the page lock to serialise THP migrations */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
lock_page(page);
/* Confirm the PTE did not while locked */
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1336,7 +1348,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
check_same:
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
clear_pmdnuma:
@@ -1345,7 +1357,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (current_nid != -1)
task_numa_fault(current_nid, HPAGE_PMD_NR, false);
return 0;
@@ -1355,8 +1367,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1371,7 +1384,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1380,7 +1393,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
+ spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1394,13 +1407,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned char *vec)
{
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1415,6 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
{
int ret = 0;
pmd_t pmd;
+ spinlock_t *uninitialized_var(ptl);
struct mm_struct *mm = vma->vm_mm;
@@ -1433,12 +1448,12 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- ret = __pmd_trans_huge_lock(old_pmd, vma);
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &ptl);
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
out:
return ret;
@@ -1449,8 +1464,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
- if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
@@ -1466,7 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
}
set_pmd_at(mm, addr, pmd, entry);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
ret = 1;
}
@@ -1480,12 +1496,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ spinlock_t **ptl)
{
- spin_lock(&vma->vm_mm->page_table_lock);
+ *ptl = huge_pmd_lockptr(vma->vm_mm, pmd);
+ spin_lock(*ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1494,14 +1512,23 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return 1;
}
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(*ptl);
return 0;
}
+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm.
+ *
+ * When it's true, this function returns *pmd with holding the page table lock
+ * and passing it back to the caller via @ptl.
+ * If it's false, returns NULL without holding the page table lock.
+ */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag)
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
{
pmd_t *pmd, *ret = NULL;
@@ -1511,10 +1538,12 @@ pmd_t *page_check_address_pmd(struct page *page,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
+ *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(*ptl);
if (pmd_none(*pmd))
- goto out;
+ goto unlock;
if (pmd_page(*pmd) != page)
- goto out;
+ goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1524,12 +1553,15 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto out;
+ goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
ret = pmd;
+ goto out;
}
+unlock:
+ spin_unlock(*ptl);
out:
return ret;
}
@@ -1541,14 +1573,15 @@ static int __split_huge_page_splitting(struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
int ret = 0;
+ spinlock_t *uninitialized_var(ptl);
/* For mmu_notifiers */
const unsigned long mmun_start = address;
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1559,8 +1592,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
@@ -1694,10 +1727,10 @@ static int __split_huge_page_map(struct page *page,
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
+ spinlock_t *uninitialized_var(ptl);
- spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1752,8 +1785,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -2314,7 +2347,7 @@ static void collapse_huge_page(struct mm_struct *mm,
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ spin_lock(ptl); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2322,7 +2355,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
spin_lock(ptl);
@@ -2331,7 +2364,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2339,7 +2372,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2364,13 +2397,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*hpage = NULL;
@@ -2698,6 +2731,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
+ spinlock_t *ptl;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2706,22 +2740,23 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
diff --git v3.11-rc3.orig/mm/memcontrol.c v3.11-rc3/mm/memcontrol.c
index 00a7a66..3949444 100644
--- v3.11-rc3.orig/mm/memcontrol.c
+++ v3.11-rc3/mm/memcontrol.c
@@ -6591,12 +6591,12 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6769,7 +6769,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *uninitialized_var(ptl);
enum mc_target_type target_type;
union mc_target target;
struct page *page;
@@ -6785,9 +6785,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6804,7 +6804,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
diff --git v3.11-rc3.orig/mm/memory.c v3.11-rc3/mm/memory.c
index aa3b994..8c97ef0 100644
--- v3.11-rc3.orig/mm/memory.c
+++ v3.11-rc3/mm/memory.c
@@ -1529,20 +1529,21 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* fall through */
}
split_fallthrough:
@@ -3607,17 +3608,17 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte, *orig_pte;
unsigned long _addr = addr & PMD_MASK;
unsigned long offset;
- spinlock_t *ptl;
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmdp);
bool numa = false;
int local_nid = numa_node_id();
- spin_lock(&mm->page_table_lock);
+ spin_lock(ptl);
pmd = *pmdp;
if (pmd_numa(pmd)) {
set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
numa = true;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
if (!numa)
return 0;
diff --git v3.11-rc3.orig/mm/migrate.c v3.11-rc3/mm/migrate.c
index c69a9c7..1e1e9f2 100644
--- v3.11-rc3.orig/mm/migrate.c
+++ v3.11-rc3/mm/migrate.c
@@ -1659,6 +1659,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
+ spinlock_t *ptl;
/*
* Don't migrate pages that are mapped in multiple processes.
@@ -1699,9 +1700,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- spin_lock(&mm->page_table_lock);
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, entry))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1746,7 +1748,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
unlock_page(new_page);
unlock_page(page);
diff --git v3.11-rc3.orig/mm/mprotect.c v3.11-rc3/mm/mprotect.c
index 94722a4..c65c390 100644
--- v3.11-rc3.orig/mm/mprotect.c
+++ v3.11-rc3/mm/mprotect.c
@@ -116,9 +116,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
pmd_t *pmd)
{
- spin_lock(&mm->page_table_lock);
+ spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
}
#else
static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
diff --git v3.11-rc3.orig/mm/pgtable-generic.c v3.11-rc3/mm/pgtable-generic.c
index e1a6e4f..8e49928 100644
--- v3.11-rc3.orig/mm/pgtable-generic.c
+++ v3.11-rc3/mm/pgtable-generic.c
@@ -124,11 +124,10 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* The caller must hold page table lock */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
@@ -141,13 +140,14 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* no "address" argument so destroys page coloring of some arch */
+/*
+ * no "address" argument so destroys page coloring of some arch
+ * The caller must hold page table lock.
+ */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;
- assert_spin_locked(&mm->page_table_lock);
-
/* FIFO */
pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
diff --git v3.11-rc3.orig/mm/rmap.c v3.11-rc3/mm/rmap.c
index eccec58..798f6ae 100644
--- v3.11-rc3.orig/mm/rmap.c
+++ v3.11-rc3/mm/rmap.c
@@ -666,24 +666,24 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
int referenced = 0;
+ spinlock_t *uninitialized_var(ptl);
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
- spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG);
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
goto out;
}
if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
*mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
goto out;
@@ -692,10 +692,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
} else {
pte_t *pte;
- spinlock_t *ptl;
/*
* rmap might return false positives; we must filter
--
1.8.3.1
* RE: [PATCH 2/2] thp: support split page table lock
2013-09-05 21:27 ` [PATCH 2/2] thp: support split page table lock Naoya Horiguchi
@ 2013-09-06 10:48 ` Kirill A. Shutemov
2013-09-06 16:27 ` Naoya Horiguchi
2013-09-08 16:56 ` Aneesh Kumar K.V
2013-09-06 16:04 ` Alex Thorlton
1 sibling, 2 replies; 14+ messages in thread
From: Kirill A. Shutemov @ 2013-09-06 10:48 UTC (permalink / raw)
To: Naoya Horiguchi
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, Alex Thorlton, linux-kernel
Naoya Horiguchi wrote:
> Thp related code also uses per process mm->page_table_lock now.
> So making it fine-grained can provide better performance.
>
> This patch makes thp support split page table lock by using page->ptl
> of the pages storing "pmd_trans_huge" pmds.
>
> Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> are expected by their caller to pass back the pointer of ptl, so this
> patch adds to those functions new arguments for that. Rather than that,
> this patch gives only straightforward replacement.
>
> ChangeLog v3:
> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>
> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Generally, looks good. Few notes:
I believe you need to convert __pte_alloc() to new locking. Not sure about
__pte_alloc_kernel().
Have you check all rest mm->page_table_lock, that they shouldn't be
converted to new locking?
You use uninitialized_var() a lot. It's ugly. I've checked a few places
(task_mmu.c, copy_huge_pmd) and haven't found a reason why we need it there.
Why?
You often do
+ ptl = huge_pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
Should we have a helper to combine them? huge_pmd_lock()?
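Something along these lines, perhaps (just a sketch of the idea, assuming
huge_pmd_lockptr() keeps the signature used in this series):

static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

	/* take whatever lock huge_pmd_lockptr() resolved to for this pmd */
	spin_lock(ptl);
	return ptl;
}

Callers would then do "ptl = huge_pmd_lock(mm, pmd); ... spin_unlock(ptl);",
which keeps the lock pointer visible at the unlock site.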
--
Kirill A. Shutemov
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-05 21:27 ` [PATCH 2/2] thp: support split page table lock Naoya Horiguchi
2013-09-06 10:48 ` Kirill A. Shutemov
@ 2013-09-06 16:04 ` Alex Thorlton
2013-09-06 18:01 ` Naoya Horiguchi
1 sibling, 1 reply; 14+ messages in thread
From: Alex Thorlton @ 2013-09-06 16:04 UTC (permalink / raw)
To: Naoya Horiguchi
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, linux-kernel
On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> Thp related code also uses per process mm->page_table_lock now.
> So making it fine-grained can provide better performance.
>
> This patch makes thp support split page table lock by using page->ptl
> of the pages storing "pmd_trans_huge" pmds.
>
> Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> are expected by their caller to pass back the pointer of ptl, so this
> patch adds to those functions new arguments for that. Rather than that,
> this patch gives only straightforward replacement.
>
> ChangeLog v3:
> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> - added missing declaration of ptl in do_huge_pmd_anonymous_page()
I've applied these and tested them using the same test program that I
used when I was working on the same issue, and I'm running into some
bugs. Here's a stack trace:
general protection fault: 0000 [#1] SMP
Modules linked in:
CPU: 268 PID: 32381 Comm: memscale Not tainted
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
task: ffff880fbdd82180 ti: ffff880fc0c5a000 task.ti: ffff880fc0c5a000
RIP: 0010:[<ffffffff810e3eef>] [<ffffffff810e3eef>]
pgtable_trans_huge_withdraw+0x38/0x60
RSP: 0018:ffff880fc0c5bc88 EFLAGS: 00010297
RAX: ffffea17cebe8838 RBX: 00000015309bd000 RCX: ffffea01f623b028
RDX: dead000000100100 RSI: ffff8dcf77d84c30 RDI: ffff880fbda67580
RBP: ffff880fc0c5bc88 R08: 0000000000000013 R09: 0000000000014da0
R10: ffff880fc0c5bc88 R11: ffff888f7efda000 R12: ffff8dcf77d84c30
R13: ffff880fc0c5bdf8 R14: 800005cf401ff067 R15: ffff8b4de5fabff8
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc0c5bcc8 ffffffff810f7643 ffff880fc0c5bcc8 ffffffff810d8297
ffffea1456237510 00007fc7b0e00000 0000000000000000 00007fc7b0c00000
ffff880fc0c5bda8 ffffffff810d85ba ffff880fc0c5bd48 ffff880fc0c5bd68
Call Trace:
[<ffffffff810f7643>] zap_huge_pmd+0x4c/0x101
[<ffffffff810d8297>] ? tlb_flush_mmu+0x58/0x75
[<ffffffff810d85ba>] unmap_single_vma+0x306/0x7d6
[<ffffffff810d8ad9>] unmap_vmas+0x4f/0x82
[<ffffffff810dab5e>] exit_mmap+0x8b/0x113
[<ffffffff810a9743>] ? __delayacct_add_tsk+0x170/0x182
[<ffffffff8103c609>] mmput+0x3e/0xc4
[<ffffffff8104088c>] do_exit+0x380/0x907
[<ffffffff810fb89c>] ? vfs_write+0x149/0x1a3
[<ffffffff81040e85>] do_group_exit+0x72/0x9b
[<ffffffff81040ec0>] SyS_exit_group+0x12/0x16
[<ffffffff814f52d2>] system_call_fastpath+0x16/0x1b
Code: 51 20 48 8d 41 20 48 39 c2 75 0d 48 c7 87 28 03 00 00 00 00 00 00
eb 36 48 8d 42 e0 48 89 87 28 03 00 00 48 8b 51 20 48 8b 41 28 <48> 89
42 08 48 89 10 48 ba 00 01 10 00 00 00 ad de 48 b8 00 02
RIP [<ffffffff810e3eef>] pgtable_trans_huge_withdraw+0x38/0x60
RSP <ffff880fc0c5bc88>
---[ end trace e5413b388b6ea448 ]---
Fixing recursive fault but reboot is needed!
general protection fault: 0000 [#2] SMP
Modules linked in:
CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
Workqueue: events vmstat_update
task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
RIP: 0010:[<ffffffff810bcdcb>] [<ffffffff810bcdcb>]
free_pcppages_bulk+0x97/0x329
RSP: 0018:ffff880fc1a77c98 EFLAGS: 00010082
RAX: ffff880fffd94d68 RBX: dead0000002001e0 RCX: ffff880fffd94d50
RDX: ffff880fffd94d68 RSI: 000000000000001f RDI: ffff888f7efdac68
RBP: ffff880fc1a77cf8 R08: 0000000000000400 R09: ffffffff81a8bf00
R10: ffff884f7efdac00 R11: ffffffff81009bae R12: dead000000200200
R13: ffff888f7efdac00 R14: 000000000000001f R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc1a77ce8 ffff880fffd94d68 0000000000000010 ffff880fffd94d50
0000001ff9276a68 ffff880fffd94d60 0000000000000000 000000000000001f
ffff880fffd94d50 0000000000000292 ffff880fc1a77d38 ffff880fffd95d05
Call Trace:
[<ffffffff810bd149>] drain_zone_pages+0x33/0x42
[<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
[<ffffffff810cd609>] vmstat_update+0x11/0x43
[<ffffffff8105350f>] process_one_work+0x260/0x389
[<ffffffff8105381a>] worker_thread+0x1e2/0x332
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff810579df>] kthread+0xb3/0xbd
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
[<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
Code: 48 89 55 c8 48 39 14 08 74 ce 41 83 fe 03 44 0f 44 75 c4 48 83 c2
08 48 89 45 b0 48 89 55 a8 48 8b 45 a8 4c 8b 20 49 8d 5c 24 e0 <48> 8b
53 20 48 8b 43 28 48 89 42 08 48 89 10 48 ba 00 01 10 00
RIP [<ffffffff810bcdcb>] free_pcppages_bulk+0x97/0x329
RSP <ffff880fc1a77c98>
---[ end trace e5413b388b6ea449 ]---
BUG: unable to handle kernel paging request at ffffffffffffffd8
IP: [<ffffffff8105742c>] kthread_data+0xb/0x11
PGD 1a0c067 PUD 1a0e067 PMD 0
Oops: 0000 [#3] SMP
Modules linked in:
CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
3.11.0-medusa-03121-g757f8ca #184
Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
01/15/2013
task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
RIP: 0010:[<ffffffff8105742c>] [<ffffffff8105742c>]
kthread_data+0xb/0x11
RSP: 0018:ffff880fc1a77948 EFLAGS: 00010092
RAX: 0000000000000000 RBX: 000000000000010c RCX: 0000000000000000
RDX: 000000000000000f RSI: 000000000000010c RDI: ffff880fc1a74280
RBP: ffff880fc1a77948 R08: 00000000000442c8 R09: 0000000000000000
R10: dead000000200200 R11: ffff880fc1a742e8 R12: ffff880fc1a74868
R13: ffff880fffd91cc0 R14: ffff880ff9b7a040 R15: 000000000000010c
FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000028 CR3: 0000000001a0b000 CR4: 00000000000407e0
Stack:
ffff880fc1a77968 ffffffff8105151f ffff880fc1a77968 ffff880fc1a74280
ffff880fc1a77ab8 ffffffff814f2e98 ffff880fc1a76010 0000000000004000
ffff880fc1a74280 0000000000011cc0 ffff880fc1a77fd8 ffff880fc1a77fd8
Call Trace:
[<ffffffff8105151f>] wq_worker_sleeping+0x10/0x82
[<ffffffff814f2e98>] __schedule+0x1b7/0x8f7
[<ffffffff8135d4bd>] ? mix_pool_bytes+0x4a/0x56
[<ffffffff810a5d05>] ? call_rcu_sched+0x16/0x18
[<ffffffff8103f708>] ? release_task+0x3a7/0x3bf
[<ffffffff814f36b5>] schedule+0x61/0x63
[<ffffffff81040e0f>] do_exit+0x903/0x907
[<ffffffff8100529a>] oops_end+0xb9/0xc1
[<ffffffff81005393>] die+0x55/0x5e
[<ffffffff8100341a>] do_general_protection+0x93/0x139
[<ffffffff814f4d82>] general_protection+0x22/0x30
[<ffffffff81009bae>] ? default_idle+0x6/0x8
[<ffffffff810bcdcb>] ? free_pcppages_bulk+0x97/0x329
[<ffffffff810bcd5d>] ? free_pcppages_bulk+0x29/0x329
[<ffffffff810bd149>] drain_zone_pages+0x33/0x42
[<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
[<ffffffff810cd609>] vmstat_update+0x11/0x43
[<ffffffff8105350f>] process_one_work+0x260/0x389
[<ffffffff8105381a>] worker_thread+0x1e2/0x332
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff810579df>] kthread+0xb3/0xbd
[<ffffffff81053638>] ? process_one_work+0x389/0x389
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
[<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
[<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
Code: 65 48 8b 04 25 40 b7 00 00 48 8b 80 90 05 00 00 48 89 e5 48 8b 40
c8 c9 48 c1 e8 02 83 e0 01 c3 48 8b 87 90 05 00 00 55 48 89 e5 <48> 8b
40 d8 c9 c3 48 3b 3d 67 ca c2 00 55 48 89 e5 75 09 0f bf
RIP [<ffffffff8105742c>] kthread_data+0xb/0x11
RSP <ffff880fc1a77948>
CR2: ffffffffffffffd8
---[ end trace e5413b388b6ea44a ]---
Fixing recursive fault but reboot is needed!
I'm testing on a 528 core machine, with ~2TB of memory, THP on. The
test case works like this:
- Spawn 512 threads using pthread_create, pin each thread to a separate
cpu
- Each thread allocates 512mb, local to its cpu
- Threads are sent a "go" signal, all threads begin touching the first
byte of each 4k chunk of their 512mb simultaneously
I'm working on debugging the issue now, but I thought I'd get this out
to everyone in case they might have some input. I'll try and get my
test program cleaned up and posted somewhere today so that others can
try it out as well.
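Until then, the workload boils down to roughly the following (a quick sketch
of the idea rather than the actual program; the thread count and sizes are
hard-coded here only for illustration):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>
#include <sys/mman.h>

#define NTHREADS	512
#define PER_THREAD	(512UL << 20)		/* 512mb per thread */

static pthread_barrier_t go;

static void *worker(void *arg)
{
	long cpu = (long)arg;
	unsigned long off;
	cpu_set_t set;
	char *buf;

	/* pin this thread to its own cpu */
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	/* reserve 512mb; the first touch below allocates it local to this cpu */
	buf = mmap(NULL, PER_THREAD, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		exit(1);

	/* wait for the "go" signal, then touch the first byte of each 4k chunk */
	pthread_barrier_wait(&go);
	for (off = 0; off < PER_THREAD; off += 4096)
		buf[off] = 1;

	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	long i;

	pthread_barrier_init(&go, NULL, NTHREADS);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	/* process exit then tears everything down via exit_mmap() */
	return 0;
}

Built with gcc -O2 -pthread and THP set to "always", the faults go through the
huge pmd paths, and the teardown at exit is where the trace above fired.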
- Alex
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-06 10:48 ` Kirill A. Shutemov
@ 2013-09-06 16:27 ` Naoya Horiguchi
2013-09-06 16:46 ` Kirill A. Shutemov
2013-09-08 16:56 ` Aneesh Kumar K.V
1 sibling, 1 reply; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-06 16:27 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, Aneesh Kumar K.V,
Alex Thorlton, linux-kernel
Hi Kirill,
On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> Naoya Horiguchi wrote:
> > Thp related code also uses per process mm->page_table_lock now.
> > So making it fine-grained can provide better performance.
> >
> > This patch makes thp support split page table lock by using page->ptl
> > of the pages storing "pmd_trans_huge" pmds.
> >
> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > are expected by their caller to pass back the pointer of ptl, so this
> > patch adds to those functions new arguments for that. Rather than that,
> > this patch gives only straightforward replacement.
> >
> > ChangeLog v3:
> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >
> > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>
> Generally, looks good. Few notes:
>
> I believe you need to convert __pte_alloc() to new locking. Not sure about
> __pte_alloc_kernel().
> Have you check all rest mm->page_table_lock, that they shouldn't be
> converted to new locking?
I thought that keeping __pte_alloc() using mm->page_table_lock was safe
because it uses bare mm->page_table_lock instead of pte_lockptr() even
before this patchset, but not 100% sure.
__pte_alloc() (and its family) are used in normal page path, so if it's
not safe, we've lived with unsafe code for very long (maybe since 2005).
Anyway, converting __pte_alloc() into split ptl could improve performance
(though we need testing to know what amount), so I'll try that.
> You use uninitialized_var() a lot. It's ugly. I've checked a few places
> (task_mmu.c, copy_huge_pmd) and haven't found a reason why we need it there.
> Why?
I got a compile warning of uninitialized usage when developing and added
to suppress it, but in the final form I never get such a warning.
So I'll remove this uninitialized_var()s.
> You often do
>
> + ptl = huge_pmd_lockptr(mm, pmd);
> + spin_lock(ptl);
>
> Should we have a helper to combine them? huge_pmd_lock()?
OK, I'll do it.
Thanks,
Naoya Horiguchi
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-06 16:27 ` Naoya Horiguchi
@ 2013-09-06 16:46 ` Kirill A. Shutemov
2013-09-06 17:15 ` Naoya Horiguchi
0 siblings, 1 reply; 14+ messages in thread
From: Kirill A. Shutemov @ 2013-09-06 16:46 UTC (permalink / raw)
To: Naoya Horiguchi
Cc: Kirill A. Shutemov, linux-mm, Andrew Morton, Mel Gorman,
Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel,
Andrea Arcangeli, Aneesh Kumar K.V, Alex Thorlton, linux-kernel
Naoya Horiguchi wrote:
> Hi Kirill,
>
> On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> > Naoya Horiguchi wrote:
> > > Thp related code also uses per process mm->page_table_lock now.
> > > So making it fine-grained can provide better performance.
> > >
> > > This patch makes thp support split page table lock by using page->ptl
> > > of the pages storing "pmd_trans_huge" pmds.
> > >
> > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > are expected by their caller to pass back the pointer of ptl, so this
> > > patch adds to those functions new arguments for that. Rather than that,
> > > this patch gives only straightforward replacement.
> > >
> > > ChangeLog v3:
> > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> > >
> > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> >
> > Generally, looks good. Few notes:
> >
> > I believe you need to convert __pte_alloc() to new locking. Not sure about
> > __pte_alloc_kernel().
> > Have you check all rest mm->page_table_lock, that they shouldn't be
> > converted to new locking?
>
> I thought that keeping __pte_alloc() using mm->page_table_lock was safe
> because it uses bare mm->page_table_lock instead of pte_lockptr() even
> before this patchset, but not 100% sure.
> __pte_alloc() (and its family) are used in normal page path, so if it's
> not safe, we've lived with unsafe code for very long (maybe since 2005).
> Anyway, converting __pte_alloc() into split ptl could improve performance
> (though we need testing to know what amount), so I'll try that.
No, before the patch mm->page_table_lock is what we need: it serializes
setting up pmd, not adding pte to pmd and it's subject to change with new
locking model.
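Roughly, the critical section there looks like this (paraphrased from memory,
not the exact 3.11 code; "new" is the pte page just allocated with
pte_alloc_one()):

	spin_lock(&mm->page_table_lock);
	if (likely(pmd_none(*pmd))) {		/* did someone else populate it? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);	/* install the new pte page */
		new = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);		/* lost the race, drop our copy */

That lock/unlock pair is presumably what would have to move over to the pmd's
own lock once pmd-level locking is split.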
--
Kirill A. Shutemov
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-06 16:46 ` Kirill A. Shutemov
@ 2013-09-06 17:15 ` Naoya Horiguchi
0 siblings, 0 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-06 17:15 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, Aneesh Kumar K.V,
Alex Thorlton, linux-kernel
On Fri, Sep 06, 2013 at 07:46:25PM +0300, Kirill A. Shutemov wrote:
> Naoya Horiguchi wrote:
> > Hi Kirill,
> >
> > On Fri, Sep 06, 2013 at 01:48:03PM +0300, Kirill A. Shutemov wrote:
> > > Naoya Horiguchi wrote:
> > > > Thp related code also uses per process mm->page_table_lock now.
> > > > So making it fine-grained can provide better performance.
> > > >
> > > > This patch makes thp support split page table lock by using page->ptl
> > > > of the pages storing "pmd_trans_huge" pmds.
> > > >
> > > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > > are expected by their caller to pass back the pointer of ptl, so this
> > > > patch adds to those functions new arguments for that. Rather than that,
> > > > this patch gives only straightforward replacement.
> > > >
> > > > ChangeLog v3:
> > > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> > > >
> > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> > >
> > > Generally, looks good. Few notes:
> > >
> > > I believe you need to convert __pte_alloc() to new locking. Not sure about
> > > __pte_alloc_kernel().
> > > Have you check all rest mm->page_table_lock, that they shouldn't be
> > > converted to new locking?
> >
> > I thought that keeping __pte_alloc() using mm->page_table_lock was safe
> > because it uses bare mm->page_table_lock instead of pte_lockptr() even
> > before this patchset, but not 100% sure.
> > __pte_alloc() (and its family) are used in normal page path, so if it's
> > not safe, we've lived with unsafe code for very long (maybe since 2005).
> > Anyway, converting __pte_alloc() into split ptl could improve performance
> > (though we need testing to know what amount), so I'll try that.
>
> No, before the patch mm->page_table_lock is what we need: it serializes
> setting up pmd, not adding pte to pmd and it's subject to change with new
> locking model.
OK, I see. This is a todo for the next post.
Thank you for explanation.
Naoya
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-06 16:04 ` Alex Thorlton
@ 2013-09-06 18:01 ` Naoya Horiguchi
0 siblings, 0 replies; 14+ messages in thread
From: Naoya Horiguchi @ 2013-09-06 18:01 UTC (permalink / raw)
To: Alex Thorlton
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, linux-kernel
Hi Alex,
On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> > Thp related code also uses per process mm->page_table_lock now.
> > So making it fine-grained can provide better performance.
> >
> > This patch makes thp support split page table lock by using page->ptl
> > of the pages storing "pmd_trans_huge" pmds.
> >
> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > are expected by their caller to pass back the pointer of ptl, so this
> > patch adds to those functions new arguments for that. Rather than that,
> > this patch gives only straightforward replacement.
> >
> > ChangeLog v3:
> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>
> I've applied these and tested them using the same tests program that I
> used when I was working on the same issue, and I'm running into some
> bugs. Here's a stack trace:
Thank you for helping testing. This bug is new to me.
> general protection fault: 0000 [#1] SMP
> Modules linked in:
> CPU: 268 PID: 32381 Comm: memscale Not tainted
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> task: ffff880fbdd82180 ti: ffff880fc0c5a000 task.ti: ffff880fc0c5a000
> RIP: 0010:[<ffffffff810e3eef>] [<ffffffff810e3eef>]
> pgtable_trans_huge_withdraw+0x38/0x60
> RSP: 0018:ffff880fc0c5bc88 EFLAGS: 00010297
> RAX: ffffea17cebe8838 RBX: 00000015309bd000 RCX: ffffea01f623b028
> RDX: dead000000100100 RSI: ffff8dcf77d84c30 RDI: ffff880fbda67580
> RBP: ffff880fc0c5bc88 R08: 0000000000000013 R09: 0000000000014da0
> R10: ffff880fc0c5bc88 R11: ffff888f7efda000 R12: ffff8dcf77d84c30
> R13: ffff880fc0c5bdf8 R14: 800005cf401ff067 R15: ffff8b4de5fabff8
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc0c5bcc8 ffffffff810f7643 ffff880fc0c5bcc8 ffffffff810d8297
> ffffea1456237510 00007fc7b0e00000 0000000000000000 00007fc7b0c00000
> ffff880fc0c5bda8 ffffffff810d85ba ffff880fc0c5bd48 ffff880fc0c5bd68
> Call Trace:
> [<ffffffff810f7643>] zap_huge_pmd+0x4c/0x101
> [<ffffffff810d8297>] ? tlb_flush_mmu+0x58/0x75
> [<ffffffff810d85ba>] unmap_single_vma+0x306/0x7d6
> [<ffffffff810d8ad9>] unmap_vmas+0x4f/0x82
> [<ffffffff810dab5e>] exit_mmap+0x8b/0x113
> [<ffffffff810a9743>] ? __delayacct_add_tsk+0x170/0x182
> [<ffffffff8103c609>] mmput+0x3e/0xc4
> [<ffffffff8104088c>] do_exit+0x380/0x907
> [<ffffffff810fb89c>] ? vfs_write+0x149/0x1a3
> [<ffffffff81040e85>] do_group_exit+0x72/0x9b
> [<ffffffff81040ec0>] SyS_exit_group+0x12/0x16
> [<ffffffff814f52d2>] system_call_fastpath+0x16/0x1b
> Code: 51 20 48 8d 41 20 48 39 c2 75 0d 48 c7 87 28 03 00 00 00 00 00 00
> eb 36 48 8d 42 e0 48 89 87 28 03 00 00 48 8b 51 20 48 8b 41 28 <48> 89
> 42 08 48 89 10 48 ba 00 01 10 00 00 00 ad de 48 b8 00 02
> RIP [<ffffffff810e3eef>] pgtable_trans_huge_withdraw+0x38/0x60
> RSP <ffff880fc0c5bc88>
> ---[ end trace e5413b388b6ea448 ]---
> Fixing recursive fault but reboot is needed!
> general protection fault: 0000 [#2] SMP
> Modules linked in:
> CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> Workqueue: events vmstat_update
> task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
> RIP: 0010:[<ffffffff810bcdcb>] [<ffffffff810bcdcb>]
> free_pcppages_bulk+0x97/0x329
> RSP: 0018:ffff880fc1a77c98 EFLAGS: 00010082
> RAX: ffff880fffd94d68 RBX: dead0000002001e0 RCX: ffff880fffd94d50
> RDX: ffff880fffd94d68 RSI: 000000000000001f RDI: ffff888f7efdac68
> RBP: ffff880fc1a77cf8 R08: 0000000000000400 R09: ffffffff81a8bf00
> R10: ffff884f7efdac00 R11: ffffffff81009bae R12: dead000000200200
> R13: ffff888f7efdac00 R14: 000000000000001f R15: 0000000000000000
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffff768b0b8 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc1a77ce8 ffff880fffd94d68 0000000000000010 ffff880fffd94d50
> 0000001ff9276a68 ffff880fffd94d60 0000000000000000 000000000000001f
> ffff880fffd94d50 0000000000000292 ffff880fc1a77d38 ffff880fffd95d05
> Call Trace:
> [<ffffffff810bd149>] drain_zone_pages+0x33/0x42
> [<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
> [<ffffffff810cd609>] vmstat_update+0x11/0x43
> [<ffffffff8105350f>] process_one_work+0x260/0x389
> [<ffffffff8105381a>] worker_thread+0x1e2/0x332
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff810579df>] kthread+0xb3/0xbd
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> [<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> Code: 48 89 55 c8 48 39 14 08 74 ce 41 83 fe 03 44 0f 44 75 c4 48 83 c2
> 08 48 89 45 b0 48 89 55 a8 48 8b 45 a8 4c 8b 20 49 8d 5c 24 e0 <48> 8b
> 53 20 48 8b 43 28 48 89 42 08 48 89 10 48 ba 00 01 10 00
> RIP [<ffffffff810bcdcb>] free_pcppages_bulk+0x97/0x329
> RSP <ffff880fc1a77c98>
> ---[ end trace e5413b388b6ea449 ]---
> BUG: unable to handle kernel paging request at ffffffffffffffd8
> IP: [<ffffffff8105742c>] kthread_data+0xb/0x11
> PGD 1a0c067 PUD 1a0e067 PMD 0
> Oops: 0000 [#3] SMP
> Modules linked in:
> CPU: 268 PID: 1722 Comm: kworker/268:1 Tainted: G D
> 3.11.0-medusa-03121-g757f8ca #184
> Hardware name: SGI UV2000/ROMLEY, BIOS SGI UV 2000/3000 series BIOS
> 01/15/2013
> task: ffff880fc1a74280 ti: ffff880fc1a76000 task.ti: ffff880fc1a76000
> RIP: 0010:[<ffffffff8105742c>] [<ffffffff8105742c>]
> kthread_data+0xb/0x11
> RSP: 0018:ffff880fc1a77948 EFLAGS: 00010092
> RAX: 0000000000000000 RBX: 000000000000010c RCX: 0000000000000000
> RDX: 000000000000000f RSI: 000000000000010c RDI: ffff880fc1a74280
> RBP: ffff880fc1a77948 R08: 00000000000442c8 R09: 0000000000000000
> R10: dead000000200200 R11: ffff880fc1a742e8 R12: ffff880fc1a74868
> R13: ffff880fffd91cc0 R14: ffff880ff9b7a040 R15: 000000000000010c
> FS: 0000000000000000(0000) GS:ffff880fffd80000(0000)
> knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000028 CR3: 0000000001a0b000 CR4: 00000000000407e0
> Stack:
> ffff880fc1a77968 ffffffff8105151f ffff880fc1a77968 ffff880fc1a74280
> ffff880fc1a77ab8 ffffffff814f2e98 ffff880fc1a76010 0000000000004000
> ffff880fc1a74280 0000000000011cc0 ffff880fc1a77fd8 ffff880fc1a77fd8
> Call Trace:
> [<ffffffff8105151f>] wq_worker_sleeping+0x10/0x82
> [<ffffffff814f2e98>] __schedule+0x1b7/0x8f7
> [<ffffffff8135d4bd>] ? mix_pool_bytes+0x4a/0x56
> [<ffffffff810a5d05>] ? call_rcu_sched+0x16/0x18
> [<ffffffff8103f708>] ? release_task+0x3a7/0x3bf
> [<ffffffff814f36b5>] schedule+0x61/0x63
> [<ffffffff81040e0f>] do_exit+0x903/0x907
> [<ffffffff8100529a>] oops_end+0xb9/0xc1
> [<ffffffff81005393>] die+0x55/0x5e
> [<ffffffff8100341a>] do_general_protection+0x93/0x139
> [<ffffffff814f4d82>] general_protection+0x22/0x30
> [<ffffffff81009bae>] ? default_idle+0x6/0x8
> [<ffffffff810bcdcb>] ? free_pcppages_bulk+0x97/0x329
> [<ffffffff810bcd5d>] ? free_pcppages_bulk+0x29/0x329
> [<ffffffff810bd149>] drain_zone_pages+0x33/0x42
> [<ffffffff810cd5a6>] refresh_cpu_vm_stats+0xcc/0x11e
> [<ffffffff810cd609>] vmstat_update+0x11/0x43
> [<ffffffff8105350f>] process_one_work+0x260/0x389
> [<ffffffff8105381a>] worker_thread+0x1e2/0x332
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff810579df>] kthread+0xb3/0xbd
> [<ffffffff81053638>] ? process_one_work+0x389/0x389
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> [<ffffffff814f522c>] ret_from_fork+0x7c/0xb0
> [<ffffffff8105792c>] ? kthread_freezable_should_stop+0x5b/0x5b
> Code: 65 48 8b 04 25 40 b7 00 00 48 8b 80 90 05 00 00 48 89 e5 48 8b 40
> c8 c9 48 c1 e8 02 83 e0 01 c3 48 8b 87 90 05 00 00 55 48 89 e5 <48> 8b
> 40 d8 c9 c3 48 3b 3d 67 ca c2 00 55 48 89 e5 75 09 0f bf
> RIP [<ffffffff8105742c>] kthread_data+0xb/0x11
> RSP <ffff880fc1a77948>
> CR2: ffffffffffffffd8
> ---[ end trace e5413b388b6ea44a ]---
> Fixing recursive fault but reboot is needed!
>
> I'm testing on a 528 core machine, with ~2TB of memory, THP on. The
> test case works like this:
>
> - Spawn 512 threads using pthread_create, pin each thread to a separate
> cpu
> - Each thread allocates 512mb, local to its cpu
> - Threads are sent a "go" signal, all threads begin touching the first
> byte of each 4k chunk of their 512mb simultaneously
>
> I'm working on debugging the issue now, but I thought I'd get this out
> to everyone in case they might have some input. I'll try and get my
> test program cleaned up and posted somewhere today so that others can
> try it out as well.
Thanks. Please let me know when it's available. I'll look at it.
Naoya
* RE: [PATCH 2/2] thp: support split page table lock
2013-09-06 10:48 ` Kirill A. Shutemov
2013-09-06 16:27 ` Naoya Horiguchi
@ 2013-09-08 16:56 ` Aneesh Kumar K.V
1 sibling, 0 replies; 14+ messages in thread
From: Aneesh Kumar K.V @ 2013-09-08 16:56 UTC (permalink / raw)
To: Kirill A. Shutemov, Naoya Horiguchi
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, Alex Thorlton,
linux-kernel
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> writes:
> Naoya Horiguchi wrote:
>> Thp related code also uses per process mm->page_table_lock now.
>> So making it fine-grained can provide better performance.
>>
>> This patch makes thp support split page table lock by using page->ptl
>> of the pages storing "pmd_trans_huge" pmds.
>>
>> Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
>> are expected by their caller to pass back the pointer of ptl, so this
>> patch adds to those functions new arguments for that. Rather than that,
>> this patch gives only straightforward replacement.
>>
>> ChangeLog v3:
>> - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
>> - added missing declaration of ptl in do_huge_pmd_anonymous_page()
>>
>> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>
> Generally, looks good. Few notes:
>
> I believe you need to convert __pte_alloc() to new locking. Not sure about
> __pte_alloc_kernel().
> Have you check all rest mm->page_table_lock, that they shouldn't be
> converted to new locking?
Maybe we can have a CONFIG_DEBUG_VM version of pmd_populate that does the
check with assert_spin_locked?
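Something like the following, maybe (only a sketch; the wrapper name and the
use of huge_pmd_lockptr() here are guesses, not code from the series):

#ifdef CONFIG_DEBUG_VM
static inline void pmd_populate_checked(struct mm_struct *mm, pmd_t *pmd,
					pgtable_t pte)
{
	/* catch callers that populate a pmd without holding its lock */
	assert_spin_locked(huge_pmd_lockptr(mm, pmd));
	pmd_populate(mm, pmd, pte);
}
#else
#define pmd_populate_checked(mm, pmd, pte)	pmd_populate(mm, pmd, pte)
#endif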
-aneesh
* Re: [PATCH 2/2] thp: support split page table lock
@ 2013-09-09 2:34 Daniel J Blueman
2013-09-09 15:43 ` Alex Thorlton
0 siblings, 1 reply; 14+ messages in thread
From: Daniel J Blueman @ 2013-09-09 2:34 UTC (permalink / raw)
To: Naoya Horiguchi
Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko,
KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli, kirill.shutemov,
Aneesh Kumar K.V, LKML, Alex Thorlton, Steffen Persvold
On Saturday, 7 September 2013 02:10:02 UTC+8, Naoya Horiguchi wrote:
> Hi Alex,
>
> On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> > On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> > > Thp related code also uses per process mm->page_table_lock now.
> > > So making it fine-grained can provide better performance.
> > >
> > > This patch makes thp support split page table lock by using page->ptl
> > > of the pages storing "pmd_trans_huge" pmds.
> > >
> > > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> > > are expected by their caller to pass back the pointer of ptl, so this
> > > patch adds to those functions new arguments for that. Rather than that,
> > > this patch gives only straightforward replacement.
> > >
> > > ChangeLog v3:
> > > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> > > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >
> > I've applied these and tested them using the same tests program that I
> > used when I was working on the same issue, and I'm running into some
> > bugs. Here's a stack trace:
>
> Thank you for helping testing. This bug is new to me.
With 3.11, this patch series and CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS, I
consistently hit the same failure when exiting one of my stress-testers
[1] when using eg 24 cores.
Doesn't happen with 8 cores, so likely needs enough virtual memory to
use multiple split locks. Otherwise, this is very promising work!
[1] http://quora.org/2013/fft3d.c
--
Daniel J Blueman
Principal Software Engineer, Numascale
* Re: [PATCH 2/2] thp: support split page table lock
2013-09-09 2:34 [PATCH 2/2] thp: support split page table lock Daniel J Blueman
@ 2013-09-09 15:43 ` Alex Thorlton
0 siblings, 0 replies; 14+ messages in thread
From: Alex Thorlton @ 2013-09-09 15:43 UTC (permalink / raw)
To: Daniel J Blueman
Cc: Naoya Horiguchi, linux-mm, Andrew Morton, Mel Gorman, Andi Kleen,
Michal Hocko, KOSAKI Motohiro, Rik van Riel, Andrea Arcangeli,
kirill.shutemov, Aneesh Kumar K.V, LKML, Steffen Persvold
On Mon, Sep 09, 2013 at 10:34:45AM +0800, Daniel J Blueman wrote:
> On Saturday, 7 September 2013 02:10:02 UTC+8, Naoya Horiguchi wrote:
> >Hi Alex,
> >
> >On Fri, Sep 06, 2013 at 11:04:23AM -0500, Alex Thorlton wrote:
> >> On Thu, Sep 05, 2013 at 05:27:46PM -0400, Naoya Horiguchi wrote:
> >> > Thp related code also uses per process mm->page_table_lock now.
> >> > So making it fine-grained can provide better performance.
> >> >
> >> > This patch makes thp support split page table lock by using page->ptl
> >> > of the pages storing "pmd_trans_huge" pmds.
> >> >
> >> > Some functions like pmd_trans_huge_lock() and page_check_address_pmd()
> >> > are expected by their caller to pass back the pointer of ptl, so this
> >> > patch adds to those functions new arguments for that. Rather than that,
> >> > this patch gives only straightforward replacement.
> >> >
> >> > ChangeLog v3:
> >> > - fixed argument of huge_pmd_lockptr() in copy_huge_pmd()
> >> > - added missing declaration of ptl in do_huge_pmd_anonymous_page()
> >>
> >> I've applied these and tested them using the same tests program that I
> >> used when I was working on the same issue, and I'm running into some
> >> bugs. Here's a stack trace:
> >
> >Thank you for helping testing. This bug is new to me.
>
> With 3.11, this patch series and CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS,
> I consistently hit the same failure when exiting one of my
> stress-testers [1] when using eg 24 cores.
>
> Doesn't happen with 8 cores, so likely needs enough virtual memory
> to use multiple split locks. Otherwise, this is very promising work!
Daniel,
Hillf Danton (dhillf@gmail.com) suggested putting the big
page_table_lock back into the two functions seen below. I re-tested
with this change and it seems to resolve the issue.
- Alex
--- a/mm/pgtable-generic.c Sat Sep 7 15:17:52 2013
+++ b/mm/pgtable-generic.c Sat Sep 7 15:20:28 2013
@@ -127,12 +127,14 @@ void pmdp_splitting_flush(struct vm_area
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
+ spin_lock(&mm->page_table_lock);
/* FIFO */
if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
else
list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
mm->pmd_huge_pte = pgtable;
+ spin_unlock(&mm->page_table_lock);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -144,6 +146,7 @@ pgtable_t pgtable_trans_huge_withdraw(st
{
pgtable_t pgtable;
+ spin_lock(&mm->page_table_lock);
/* FIFO */
pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
@@ -153,6 +156,7 @@ pgtable_t pgtable_trans_huge_withdraw(st
struct page, lru);
list_del(&pgtable->lru);
}
+ spin_unlock(&mm->page_table_lock);
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */