* [PATCH 0/2] hugetlbfs: support split page table lock @ 2013-05-28 19:52 Naoya Horiguchi 2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi 2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi 0 siblings, 2 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-05-28 19:52 UTC (permalink / raw) To: linux-mm Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel Hi, In the previous discussion [1] of the "extend hugepage migration" patches, Michal and Kosaki-san commented that, in the patch "migrate: add migrate_entry_wait_huge()", we need to solve the issue in an arch-independent manner and should also handle the USE_SPLIT_PTLOCKS=y case. This patch set does that. I confirmed that the patched kernel shows no regression in the libhugetlbfs functional tests. [1]: http://thread.gmane.org/gmane.linux.kernel.mm/96665/focus=96661 Thanks, Naoya Horiguchi -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 1/2] hugetlbfs: support split page table lock 2013-05-28 19:52 [PATCH 0/2] hugetlbfs: support split page table lock Naoya Horiguchi @ 2013-05-28 19:52 ` Naoya Horiguchi 2013-05-29 1:09 ` Wanpeng Li ` (2 more replies) 2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi 1 sibling, 3 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-05-28 19:52 UTC (permalink / raw) To: linux-mm Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel Currently all of page table handling by hugetlbfs code are done under mm->page_table_lock. This is not optimal because there can be lock contentions between unrelated components using this lock. This patch makes hugepage support split page table lock so that we use page->ptl of the leaf node of page table tree which is pte for normal pages but can be pmd and/or pud for hugepages of some architectures. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> --- arch/x86/mm/hugetlbpage.c | 6 ++-- include/linux/hugetlb.h | 18 ++++++++++ mm/hugetlb.c | 84 ++++++++++++++++++++++++++++------------------- 3 files changed, 73 insertions(+), 35 deletions(-) diff --git v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c v3.10-rc3/arch/x86/mm/hugetlbpage.c index ae1aa71..0e4a396 100644 --- v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c +++ v3.10-rc3/arch/x86/mm/hugetlbpage.c @@ -75,6 +75,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; + spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -89,6 +90,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { get_page(virt_to_page(spte)); + ptl = huge_pte_lockptr(mm, spte); break; } } @@ -97,12 +99,12 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); diff --git v3.10-rc3.orig/include/linux/hugetlb.h v3.10-rc3/include/linux/hugetlb.h index a639c87..40f3215 100644 --- v3.10-rc3.orig/include/linux/hugetlb.h +++ v3.10-rc3/include/linux/hugetlb.h @@ -32,6 +32,24 @@ void hugepage_put_subpool(struct hugepage_subpool *spool); int PageHuge(struct page *page); +#if USE_SPLIT_PTLOCKS +#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); }) +#else /* !USE_SPLIT_PTLOCKS */ +#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; }) +#endif /* USE_SPLIT_PTLOCKS */ + +#define huge_pte_offset_lock(mm, address, ptlp) \ +({ \ + pte_t *__pte = huge_pte_offset(mm, address); \ + spinlock_t *__ptl = NULL; \ + if (__pte) { \ + __ptl = huge_pte_lockptr(mm, __pte); \ + *(ptlp) = __ptl; \ + spin_lock(__ptl); \ + } \ + __pte; \ +}) + void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c index 463fb5e..8e1af32 100644 --- v3.10-rc3.orig/mm/hugetlb.c +++ v3.10-rc3/mm/hugetlb.c @@ -2325,6 +2325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct 
*src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *srcptl, *dstptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2336,8 +2337,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dstptl = huge_pte_lockptr(dst, dst_pte); + srcptl = huge_pte_lockptr(src, src_pte); + spin_lock(dstptl); + spin_lock_nested(srcptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2347,8 +2350,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(srcptl); + spin_unlock(dstptl); } return 0; @@ -2391,6 +2394,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2404,25 +2408,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: - spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset_lock(mm, address, &ptl); if (!ptep) continue; if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - continue; + goto unlock; } page = pte_page(pte); @@ -2433,7 +2436,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2450,13 +2453,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + spin_unlock(ptl); break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2570,6 +2578,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, int outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + spinlock_t *ptl = huge_pte_lockptr(mm, ptep); old_page = pte_page(pte); @@ -2601,7 +2610,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2619,7 +2628,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, BUG_ON(huge_pte_none(pte)); 
if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -2633,7 +2642,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2663,7 +2672,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, * Retake the page_table_lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ @@ -2675,10 +2684,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); page_cache_release(new_page); page_cache_release(old_page); return 0; @@ -2728,6 +2737,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2813,7 +2823,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2835,13 +2846,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); @@ -2853,6 +2864,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2921,7 +2933,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_page_table_lock; @@ -2941,7 +2954,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); out_page_table_lock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -2976,9 +2989,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && 
remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -2986,8 +2999,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -2999,6 +3014,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } @@ -3018,10 +3035,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3052,8 +3069,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3074,13 +3091,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { - ptep = huge_pte_offset(mm, address); + spinlock_t *ptl; + ptep = huge_pte_offset_lock(mm, address, &ptl); if (!ptep) continue; if (huge_pmd_unshare(mm, &address, ptep)) { pages++; + spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3090,8 +3108,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: -- 1.7.11.7 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 15+ messages in thread
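To make the new interface concrete, the following is a minimal sketch of the locking pattern this patch introduces, using the helpers it adds (huge_pte_offset_lock() and huge_pte_lockptr() are from the patch); the walker function itself and its surrounding context are illustrative assumptions, not code from the series:

/*
 * Illustrative sketch only: a hugetlb page table walker that takes the
 * split lock introduced by this patch instead of mm->page_table_lock.
 * huge_pte_offset_lock() and huge_pte_lockptr() are the helpers added
 * in include/linux/hugetlb.h above; walk_one_hugetlb_entry() is a
 * hypothetical caller.
 */
static int walk_one_hugetlb_entry(struct mm_struct *mm, unsigned long address)
{
	spinlock_t *ptl;
	pte_t *ptep;

	/* Returns with *ptl held only when a pte was found. */
	ptep = huge_pte_offset_lock(mm, address, &ptl);
	if (!ptep)
		return 0;

	if (!huge_pte_none(huge_ptep_get(ptep))) {
		/* ... operate on the hugepage entry while holding ptl ... */
	}

	spin_unlock(ptl);	/* was: spin_unlock(&mm->page_table_lock) */
	return 0;
}

With USE_SPLIT_PTLOCKS=y the lock returned is page->ptl of the page table page that actually holds the entry, so unrelated mappings no longer contend on a single per-mm lock; without it, huge_pte_lockptr() degenerates to &mm->page_table_lock, as the #else branch above shows.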
* Re: [PATCH 1/2] hugetlbfs: support split page table lock 2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi @ 2013-05-29 1:09 ` Wanpeng Li 2013-05-29 1:09 ` Wanpeng Li 2013-06-03 13:19 ` Michal Hocko 2 siblings, 0 replies; 15+ messages in thread From: Wanpeng Li @ 2013-05-29 1:09 UTC (permalink / raw) To: Naoya Horiguchi Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel On Tue, May 28, 2013 at 03:52:50PM -0400, Naoya Horiguchi wrote: >Currently all of page table handling by hugetlbfs code are done under >mm->page_table_lock. This is not optimal because there can be lock >contentions between unrelated components using this lock. > >This patch makes hugepage support split page table lock so that >we use page->ptl of the leaf node of page table tree which is pte for >normal pages but can be pmd and/or pud for hugepages of some architectures. > Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> >Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> >--- > arch/x86/mm/hugetlbpage.c | 6 ++-- > include/linux/hugetlb.h | 18 ++++++++++ > mm/hugetlb.c | 84 ++++++++++++++++++++++++++++------------------- > 3 files changed, 73 insertions(+), 35 deletions(-) > >diff --git v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c v3.10-rc3/arch/x86/mm/hugetlbpage.c >index ae1aa71..0e4a396 100644 >--- v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c >+++ v3.10-rc3/arch/x86/mm/hugetlbpage.c >@@ -75,6 +75,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) > unsigned long saddr; > pte_t *spte = NULL; > pte_t *pte; >+ spinlock_t *ptl; > > if (!vma_shareable(vma, addr)) > return (pte_t *)pmd_alloc(mm, pud, addr); >@@ -89,6 +90,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) > spte = huge_pte_offset(svma->vm_mm, saddr); > if (spte) { > get_page(virt_to_page(spte)); >+ ptl = huge_pte_lockptr(mm, spte); > break; > } > } >@@ -97,12 +99,12 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) > if (!spte) > goto out; > >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > if (pud_none(*pud)) > pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); > else > put_page(virt_to_page(spte)); >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > out: > pte = (pte_t *)pmd_alloc(mm, pud, addr); > mutex_unlock(&mapping->i_mmap_mutex); >diff --git v3.10-rc3.orig/include/linux/hugetlb.h v3.10-rc3/include/linux/hugetlb.h >index a639c87..40f3215 100644 >--- v3.10-rc3.orig/include/linux/hugetlb.h >+++ v3.10-rc3/include/linux/hugetlb.h >@@ -32,6 +32,24 @@ void hugepage_put_subpool(struct hugepage_subpool *spool); > > int PageHuge(struct page *page); > >+#if USE_SPLIT_PTLOCKS >+#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); }) >+#else /* !USE_SPLIT_PTLOCKS */ >+#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; }) >+#endif /* USE_SPLIT_PTLOCKS */ >+ >+#define huge_pte_offset_lock(mm, address, ptlp) \ >+({ \ >+ pte_t *__pte = huge_pte_offset(mm, address); \ >+ spinlock_t *__ptl = NULL; \ >+ if (__pte) { \ >+ __ptl = huge_pte_lockptr(mm, __pte); \ >+ *(ptlp) = __ptl; \ >+ spin_lock(__ptl); \ >+ } \ >+ __pte; \ >+}) >+ > void reset_vma_resv_huge_pages(struct vm_area_struct *vma); > int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); > int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); >diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c >index 463fb5e..8e1af32 100644 >--- 
v3.10-rc3.orig/mm/hugetlb.c >+++ v3.10-rc3/mm/hugetlb.c >@@ -2325,6 +2325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, > cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; > > for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { >+ spinlock_t *srcptl, *dstptl; > src_pte = huge_pte_offset(src, addr); > if (!src_pte) > continue; >@@ -2336,8 +2337,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, > if (dst_pte == src_pte) > continue; > >- spin_lock(&dst->page_table_lock); >- spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); >+ dstptl = huge_pte_lockptr(dst, dst_pte); >+ srcptl = huge_pte_lockptr(src, src_pte); >+ spin_lock(dstptl); >+ spin_lock_nested(srcptl, SINGLE_DEPTH_NESTING); > if (!huge_pte_none(huge_ptep_get(src_pte))) { > if (cow) > huge_ptep_set_wrprotect(src, addr, src_pte); >@@ -2347,8 +2350,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, > page_dup_rmap(ptepage); > set_huge_pte_at(dst, addr, dst_pte, entry); > } >- spin_unlock(&src->page_table_lock); >- spin_unlock(&dst->page_table_lock); >+ spin_unlock(srcptl); >+ spin_unlock(dstptl); > } > return 0; > >@@ -2391,6 +2394,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, > unsigned long address; > pte_t *ptep; > pte_t pte; >+ spinlock_t *ptl; > struct page *page; > struct hstate *h = hstate_vma(vma); > unsigned long sz = huge_page_size(h); >@@ -2404,25 +2408,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, > tlb_start_vma(tlb, vma); > mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); > again: >- spin_lock(&mm->page_table_lock); > for (address = start; address < end; address += sz) { >- ptep = huge_pte_offset(mm, address); >+ ptep = huge_pte_offset_lock(mm, address, &ptl); > if (!ptep) > continue; > > if (huge_pmd_unshare(mm, &address, ptep)) >- continue; >+ goto unlock; > > pte = huge_ptep_get(ptep); > if (huge_pte_none(pte)) >- continue; >+ goto unlock; > > /* > * HWPoisoned hugepage is already unmapped and dropped reference > */ > if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { > huge_pte_clear(mm, address, ptep); >- continue; >+ goto unlock; > } > > page = pte_page(pte); >@@ -2433,7 +2436,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, > */ > if (ref_page) { > if (page != ref_page) >- continue; >+ goto unlock; > > /* > * Mark the VMA as having unmapped its page so that >@@ -2450,13 +2453,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, > > page_remove_rmap(page); > force_flush = !__tlb_remove_page(tlb, page); >- if (force_flush) >+ if (force_flush) { >+ spin_unlock(ptl); > break; >+ } > /* Bail out after unmapping reference page if supplied */ >- if (ref_page) >+ if (ref_page) { >+ spin_unlock(ptl); > break; >+ } >+unlock: >+ spin_unlock(ptl); > } >- spin_unlock(&mm->page_table_lock); > /* > * mmu_gather ran out of room to batch pages, we break out of > * the PTE lock to avoid doing the potential expensive TLB invalidate >@@ -2570,6 +2578,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > int outside_reserve = 0; > unsigned long mmun_start; /* For mmu_notifiers */ > unsigned long mmun_end; /* For mmu_notifiers */ >+ spinlock_t *ptl = huge_pte_lockptr(mm, ptep); > > old_page = pte_page(pte); > >@@ -2601,7 +2610,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > 
page_cache_get(old_page); > > /* Drop page_table_lock as buddy allocator may be called */ >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > new_page = alloc_huge_page(vma, address, outside_reserve); > > if (IS_ERR(new_page)) { >@@ -2619,7 +2628,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > BUG_ON(huge_pte_none(pte)); > if (unmap_ref_private(mm, vma, old_page, address)) { > BUG_ON(huge_pte_none(pte)); >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > ptep = huge_pte_offset(mm, address & huge_page_mask(h)); > if (likely(pte_same(huge_ptep_get(ptep), pte))) > goto retry_avoidcopy; >@@ -2633,7 +2642,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > } > > /* Caller expects lock to be held */ >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > if (err == -ENOMEM) > return VM_FAULT_OOM; > else >@@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > page_cache_release(new_page); > page_cache_release(old_page); > /* Caller expects lock to be held */ >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > return VM_FAULT_OOM; > } > >@@ -2663,7 +2672,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > * Retake the page_table_lock to check for racing updates > * before the page tables are altered > */ >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > ptep = huge_pte_offset(mm, address & huge_page_mask(h)); > if (likely(pte_same(huge_ptep_get(ptep), pte))) { > /* Break COW */ >@@ -2675,10 +2684,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > /* Make the old page be freed below */ > new_page = old_page; > } >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); > /* Caller expects lock to be held */ >- spin_lock(&mm->page_table_lock); >+ spin_lock(ptl); > page_cache_release(new_page); > page_cache_release(old_page); > return 0; >@@ -2728,6 +2737,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, > struct page *page; > struct address_space *mapping; > pte_t new_pte; >+ spinlock_t *ptl; > > /* > * Currently, we are forced to kill the process in the event the >@@ -2813,7 +2823,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, > goto backout_unlocked; > } > >- spin_lock(&mm->page_table_lock); >+ ptl = huge_pte_lockptr(mm, ptep); >+ spin_lock(ptl); > size = i_size_read(mapping->host) >> huge_page_shift(h); > if (idx >= size) > goto backout; >@@ -2835,13 +2846,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, > ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); > } > >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > unlock_page(page); > out: > return ret; > > backout: >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > backout_unlocked: > unlock_page(page); > put_page(page); >@@ -2853,6 +2864,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > { > pte_t *ptep; > pte_t entry; >+ spinlock_t *ptl; > int ret; > struct page *page = NULL; > struct page *pagecache_page = NULL; >@@ -2921,7 +2933,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (page != pagecache_page) > lock_page(page); > >- spin_lock(&mm->page_table_lock); >+ ptl = huge_pte_lockptr(mm, ptep); >+ spin_lock(ptl); > /* Check for a racing update before calling hugetlb_cow */ > if (unlikely(!pte_same(entry, 
huge_ptep_get(ptep)))) > goto out_page_table_lock; >@@ -2941,7 +2954,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > update_mmu_cache(vma, address, ptep); > > out_page_table_lock: >- spin_unlock(&mm->page_table_lock); >+ spin_unlock(ptl); > > if (pagecache_page) { > unlock_page(pagecache_page); >@@ -2976,9 +2989,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, > unsigned long remainder = *nr_pages; > struct hstate *h = hstate_vma(vma); > >- spin_lock(&mm->page_table_lock); > while (vaddr < vma->vm_end && remainder) { > pte_t *pte; >+ spinlock_t *ptl = NULL; > int absent; > struct page *page; > >@@ -2986,8 +2999,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, > * Some archs (sparc64, sh*) have multiple pte_ts to > * each hugepage. We have to make sure we get the > * first, for the page indexing below to work. >+ * >+ * Note that page table lock is not held when pte is null. > */ >- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); >+ pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl); > absent = !pte || huge_pte_none(huge_ptep_get(pte)); > > /* >@@ -2999,6 +3014,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, > */ > if (absent && (flags & FOLL_DUMP) && > !hugetlbfs_pagecache_present(h, vma, vaddr)) { >+ if (pte) >+ spin_unlock(ptl); > remainder = 0; > break; > } >@@ -3018,10 +3035,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, > !huge_pte_write(huge_ptep_get(pte)))) { > int ret; > >- spin_unlock(&mm->page_table_lock); >+ if (pte) >+ spin_unlock(ptl); > ret = hugetlb_fault(mm, vma, vaddr, > (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); >- spin_lock(&mm->page_table_lock); > if (!(ret & VM_FAULT_ERROR)) > continue; > >@@ -3052,8 +3069,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, > */ > goto same_page; > } >+ spin_unlock(ptl); > } >- spin_unlock(&mm->page_table_lock); > *nr_pages = remainder; > *position = vaddr; > >@@ -3074,13 +3091,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, > flush_cache_range(vma, address, end); > > mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); >- spin_lock(&mm->page_table_lock); > for (; address < end; address += huge_page_size(h)) { >- ptep = huge_pte_offset(mm, address); >+ spinlock_t *ptl; >+ ptep = huge_pte_offset_lock(mm, address, &ptl); > if (!ptep) > continue; > if (huge_pmd_unshare(mm, &address, ptep)) { > pages++; >+ spin_unlock(ptl); > continue; > } > if (!huge_pte_none(huge_ptep_get(ptep))) { >@@ -3090,8 +3108,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, > set_huge_pte_at(mm, address, ptep, pte); > pages++; > } >+ spin_unlock(ptl); > } >- spin_unlock(&mm->page_table_lock); > /* > * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare > * may have cleared our pud entry and done put_page on the page table: >-- >1.7.11.7 > >-- >To unsubscribe, send a message with 'unsubscribe linux-mm' in >the body to majordomo@kvack.org. For more info on Linux MM, >see: http://www.linux-mm.org/ . >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] hugetlbfs: support split page table lock 2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi 2013-05-29 1:09 ` Wanpeng Li 2013-05-29 1:09 ` Wanpeng Li @ 2013-06-03 13:19 ` Michal Hocko 2013-06-03 14:34 ` Naoya Horiguchi 2 siblings, 1 reply; 15+ messages in thread From: Michal Hocko @ 2013-06-03 13:19 UTC (permalink / raw) To: Naoya Horiguchi Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel On Tue 28-05-13 15:52:50, Naoya Horiguchi wrote: > Currently all of page table handling by hugetlbfs code are done under > mm->page_table_lock. This is not optimal because there can be lock > contentions between unrelated components using this lock. While I agree with such a change in general I am a bit afraid of all subtle tweaks in the mm code that make hugetlb special. Maybe there are none for page_table_lock but I am not 100% sure. So this might be really tricky and it is not necessary for your further patches, is it? How have you tested this? > This patch makes hugepage support split page table lock so that > we use page->ptl of the leaf node of page table tree which is pte for > normal pages but can be pmd and/or pud for hugepages of some architectures. > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > --- > arch/x86/mm/hugetlbpage.c | 6 ++-- > include/linux/hugetlb.h | 18 ++++++++++ > mm/hugetlb.c | 84 ++++++++++++++++++++++++++++------------------- This doesn't seem to be the complete story. At least not from the trivial: $ find arch/ -name "*hugetlb*" | xargs git grep "page_table_lock" -- arch/powerpc/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); arch/powerpc/mm/hugetlbpage.c: spin_unlock(&mm->page_table_lock); arch/tile/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); arch/tile/mm/hugetlbpage.c: spin_unlock(&mm->page_table_lock); arch/x86/mm/hugetlbpage.c: * called with vma->vm_mm->page_table_lock held. -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] hugetlbfs: support split page table lock 2013-06-03 13:19 ` Michal Hocko @ 2013-06-03 14:34 ` Naoya Horiguchi 2013-06-03 15:42 ` Michal Hocko 0 siblings, 1 reply; 15+ messages in thread From: Naoya Horiguchi @ 2013-06-03 14:34 UTC (permalink / raw) To: Michal Hocko Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel On Mon, Jun 03, 2013 at 03:19:32PM +0200, Michal Hocko wrote: > On Tue 28-05-13 15:52:50, Naoya Horiguchi wrote: > > Currently all of page table handling by hugetlbfs code are done under > > mm->page_table_lock. This is not optimal because there can be lock > > contentions between unrelated components using this lock. > > While I agree with such a change in general I am a bit afraid of all > subtle tweaks in the mm code that make hugetlb special. Maybe there are > none for page_table_lock but I am not 100% sure. So this might be > really tricky and it is not necessary for your further patches, is it? No, this page_table_lock patch is separable from migration stuff. As you said in another email, changes going to stable should be minimal, so it's better to make 2/2 patch not depend on this patch. > How have you tested this? Other than libhugetlbfs test (that contains many workloads, but I'm not sure it can detect the possible regression of this patch,) I did simple testing where: - create a file on hugetlbfs, - create 10 processes and make each of them iterate the following: * mmap() the hugetlbfs file, * memset() the mapped range (to cause hugetlb_fault), and * munmap() the mapped range. I think that this can make racy situation which should be prevented by page table locks. > > This patch makes hugepage support split page table lock so that > > we use page->ptl of the leaf node of page table tree which is pte for > > normal pages but can be pmd and/or pud for hugepages of some architectures. > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > --- > > arch/x86/mm/hugetlbpage.c | 6 ++-- > > include/linux/hugetlb.h | 18 ++++++++++ > > mm/hugetlb.c | 84 ++++++++++++++++++++++++++++------------------- > > This doesn't seem to be the complete story. At least not from the > trivial: > $ find arch/ -name "*hugetlb*" | xargs git grep "page_table_lock" -- > arch/powerpc/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); > arch/powerpc/mm/hugetlbpage.c: spin_unlock(&mm->page_table_lock); > arch/tile/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); > arch/tile/mm/hugetlbpage.c: > spin_unlock(&mm->page_table_lock); > arch/x86/mm/hugetlbpage.c: * called with vma->vm_mm->page_table_lock held. This trivials should be fixed. Sorry. Naoya -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
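For reference, a rough reconstruction of the race test described above is sketched below; the hugetlbfs mount point, file name, mapping size and iteration count are illustrative assumptions, not details taken from the original mail:

/*
 * Rough reconstruction of the described test: NPROC processes
 * repeatedly mmap() the same hugetlbfs file, memset() it (driving
 * hugetlb_fault()), and munmap() it again.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define NPROC	10
#define LEN	(16UL << 21)	/* 16 hugepages of 2MB, chosen arbitrarily */

int main(void)
{
	int i, iter;
	int fd = open("/dev/hugepages/ptl-test", O_CREAT | O_RDWR, 0600);

	if (fd < 0 || ftruncate(fd, LEN) < 0)
		return 1;

	for (i = 0; i < NPROC; i++) {
		if (fork() == 0) {
			for (iter = 0; iter < 1000; iter++) {
				char *p = mmap(NULL, LEN,
					       PROT_READ | PROT_WRITE,
					       MAP_SHARED, fd, 0);
				if (p == MAP_FAILED)
					_exit(1);
				memset(p, 0, LEN);	/* hugetlb_fault() */
				munmap(p, LEN);
			}
			_exit(0);
		}
	}
	while (wait(NULL) > 0)
		;
	return 0;
}

The concurrent fault/unmap activity of the ten children against one file is the kind of racy situation the hugetlb page table locks have to serialize.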
* Re: [PATCH 1/2] hugetlbfs: support split page table lock 2013-06-03 14:34 ` Naoya Horiguchi @ 2013-06-03 15:42 ` Michal Hocko 0 siblings, 0 replies; 15+ messages in thread From: Michal Hocko @ 2013-06-03 15:42 UTC (permalink / raw) To: Naoya Horiguchi, Andrew Morton Cc: linux-mm, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel On Mon 03-06-13 10:34:35, Naoya Horiguchi wrote: > On Mon, Jun 03, 2013 at 03:19:32PM +0200, Michal Hocko wrote: > > On Tue 28-05-13 15:52:50, Naoya Horiguchi wrote: > > > Currently all of page table handling by hugetlbfs code are done under > > > mm->page_table_lock. This is not optimal because there can be lock > > > contentions between unrelated components using this lock. > > > > While I agree with such a change in general I am a bit afraid of all > > subtle tweaks in the mm code that make hugetlb special. Maybe there are > > none for page_table_lock but I am not 100% sure. So this might be > > really tricky and it is not necessary for your further patches, is it? > > No, this page_table_lock patch is separable from migration stuff. > As you said in another email, changes going to stable should be minimal, > so it's better to make 2/2 patch not depend on this patch. OK, so how do we go around this? Both patches are in the mm tree now. Should Andrew just drop the current version and you repost a new version? Sorry I didn't jump in sooner but I was quite busy last week. > > How have you tested this? > > Other than libhugetlbfs test (that contains many workloads, but I'm > not sure it can detect the possible regression of this patch,) > I did simple testing where: > - create a file on hugetlbfs, > - create 10 processes and make each of them iterate the following: > * mmap() the hugetlbfs file, > * memset() the mapped range (to cause hugetlb_fault), and > * munmap() the mapped range. > I think that this can make racy situation which should be prevented > by page table locks. OK, but this still requires a deep inspection of all the subtle dependencies on page_table_lock from the core mm. I might be wrong here and should be more specific about the issues I only have a suspicion about, but as this is "just" a scalability improvement (is this actually measurable?) I would suggest putting it at the end of your hugetlbfs enhancements for the migration, just from the reviewability point of view. > > > This patch makes hugepage support split page table lock so that > > > we use page->ptl of the leaf node of page table tree which is pte for > > > normal pages but can be pmd and/or pud for hugepages of some architectures. > > > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > > --- > > > arch/x86/mm/hugetlbpage.c | 6 ++-- > > > include/linux/hugetlb.h | 18 ++++++++++ > > > mm/hugetlb.c | 84 ++++++++++++++++++++++++++++------------------- > > > > This doesn't seem to be the complete story. At least not from the > > trivial: > > $ find arch/ -name "*hugetlb*" | xargs git grep "page_table_lock" -- > > arch/powerpc/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); > > arch/powerpc/mm/hugetlbpage.c: spin_unlock(&mm->page_table_lock); > > arch/tile/mm/hugetlbpage.c: spin_lock(&mm->page_table_lock); > > arch/tile/mm/hugetlbpage.c: > > spin_unlock(&mm->page_table_lock); > > arch/x86/mm/hugetlbpage.c: * called with vma->vm_mm->page_table_lock held. > > This trivials should be fixed. Sorry.
Other archs are often forgotten and cscope doesn't help exactly ;) Thanks -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-05-28 19:52 [PATCH 0/2] hugetlbfs: support split page table lock Naoya Horiguchi 2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi @ 2013-05-28 19:52 ` Naoya Horiguchi 2013-05-29 1:11 ` Wanpeng Li ` (3 more replies) 1 sibling, 4 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-05-28 19:52 UTC (permalink / raw) To: linux-mm Cc: Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel When we have a page fault for the address which is backed by a hugepage under migration, the kernel can't wait correctly and do busy looping on hugepage fault until the migration finishes. This is because pte_offset_map_lock() can't get a correct migration entry or a correct page table lock for hugepage. This patch introduces migration_entry_wait_huge() to solve this. Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" entry with huge_pte_offset() inside which all the arch-dependency of the page table structure are. So migration_entry_wait_huge() and __migration_entry_wait() are free from arch-dependency. ChangeLog v3: - use huge_pte_lockptr ChangeLog v2: - remove dup in migrate_entry_wait_huge() Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: stable@vger.kernel.org # 2.6.35 --- include/linux/swapops.h | 3 +++ mm/hugetlb.c | 2 +- mm/migrate.c | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git v3.10-rc3.orig/include/linux/swapops.h v3.10-rc3/include/linux/swapops.h index 47ead51..c5fd30d 100644 --- v3.10-rc3.orig/include/linux/swapops.h +++ v3.10-rc3/include/linux/swapops.h @@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } +static inline void migration_entry_wait_huge(struct mm_struct *mm, + pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c index 8e1af32..d91a438 100644 --- v3.10-rc3.orig/mm/hugetlb.c +++ v3.10-rc3/mm/hugetlb.c @@ -2877,7 +2877,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | diff --git v3.10-rc3.orig/mm/migrate.c v3.10-rc3/mm/migrate.c index 6f2df6e..64ff118 100644 --- v3.10-rc3.orig/mm/migrate.c +++ v3.10-rc3/mm/migrate.c @@ -204,15 +204,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. 
*/ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -240,6 +239,20 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(mm, pte); + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, -- 1.7.11.7 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 15+ messages in thread
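As a reading aid, the fault-side flow this patch establishes can be condensed as follows; the wrapper function below is purely illustrative (hugetlb_wait_if_migrating() does not exist in the kernel), while huge_pte_offset(), huge_ptep_get(), is_hugetlb_entry_migration() and migration_entry_wait_huge() are the real functions used in the hunks above:

/*
 * Illustrative condensation of the check hugetlb_fault() performs after
 * this patch.  The leaf entry found by huge_pte_offset() is used as-is;
 * no cast to pmd_t * and no extra level of page table walking.
 */
static bool hugetlb_wait_if_migrating(struct mm_struct *mm,
				      struct vm_area_struct *vma,
				      unsigned long address)
{
	struct hstate *h = hstate_vma(vma);
	pte_t *ptep = huge_pte_offset(mm, address & huge_page_mask(h));

	if (ptep && unlikely(is_hugetlb_entry_migration(huge_ptep_get(ptep)))) {
		/*
		 * Takes the page table lock via huge_pte_lockptr(mm, ptep),
		 * then waits for the page under migration to be unlocked;
		 * the fault is simply retried afterwards.
		 */
		migration_entry_wait_huge(mm, ptep);
		return true;
	}
	return false;
}

The old code cast ptep to pmd_t * and called migration_entry_wait(), whose pte_offset_map_lock() then descended one level below the real leaf entry and took the wrong lock, which is why the fault busy-looped instead of sleeping.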
* Re: [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi @ 2013-05-29 1:11 ` Wanpeng Li 2013-05-29 1:11 ` Wanpeng Li ` (2 subsequent siblings) 3 siblings, 0 replies; 15+ messages in thread From: Wanpeng Li @ 2013-05-29 1:11 UTC (permalink / raw) To: Naoya Horiguchi Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel On Tue, May 28, 2013 at 03:52:51PM -0400, Naoya Horiguchi wrote: >When we have a page fault for the address which is backed by a hugepage >under migration, the kernel can't wait correctly and do busy looping on >hugepage fault until the migration finishes. >This is because pte_offset_map_lock() can't get a correct migration entry >or a correct page table lock for hugepage. >This patch introduces migration_entry_wait_huge() to solve this. > >Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" >entry with huge_pte_offset() inside which all the arch-dependency of >the page table structure are. So migration_entry_wait_huge() and >__migration_entry_wait() are free from arch-dependency. > >ChangeLog v3: > - use huge_pte_lockptr > >ChangeLog v2: > - remove dup in migrate_entry_wait_huge() > Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> >Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> >Reviewed-by: Rik van Riel <riel@redhat.com> >Cc: stable@vger.kernel.org # 2.6.35 >--- > include/linux/swapops.h | 3 +++ > mm/hugetlb.c | 2 +- > mm/migrate.c | 23 ++++++++++++++++++----- > 3 files changed, 22 insertions(+), 6 deletions(-) > >diff --git v3.10-rc3.orig/include/linux/swapops.h v3.10-rc3/include/linux/swapops.h >index 47ead51..c5fd30d 100644 >--- v3.10-rc3.orig/include/linux/swapops.h >+++ v3.10-rc3/include/linux/swapops.h >@@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) > > extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > unsigned long address); >+extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); > #else > > #define make_migration_entry(page, write) swp_entry(0, 0) >@@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) > static inline void make_migration_entry_read(swp_entry_t *entryp) { } > static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > unsigned long address) { } >+static inline void migration_entry_wait_huge(struct mm_struct *mm, >+ pte_t *pte) { } > static inline int is_write_migration_entry(swp_entry_t entry) > { > return 0; >diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c >index 8e1af32..d91a438 100644 >--- v3.10-rc3.orig/mm/hugetlb.c >+++ v3.10-rc3/mm/hugetlb.c >@@ -2877,7 +2877,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (ptep) { > entry = huge_ptep_get(ptep); > if (unlikely(is_hugetlb_entry_migration(entry))) { >- migration_entry_wait(mm, (pmd_t *)ptep, address); >+ migration_entry_wait_huge(mm, ptep); > return 0; > } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) > return VM_FAULT_HWPOISON_LARGE | >diff --git v3.10-rc3.orig/mm/migrate.c v3.10-rc3/mm/migrate.c >index 6f2df6e..64ff118 100644 >--- v3.10-rc3.orig/mm/migrate.c >+++ v3.10-rc3/mm/migrate.c >@@ -204,15 +204,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) > * get to the page and wait until migration is finished. > * When we return from this function the fault will be retried. 
> */ >-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, >- unsigned long address) >+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, >+ spinlock_t *ptl) > { >- pte_t *ptep, pte; >- spinlock_t *ptl; >+ pte_t pte; > swp_entry_t entry; > struct page *page; > >- ptep = pte_offset_map_lock(mm, pmd, address, &ptl); >+ spin_lock(ptl); > pte = *ptep; > if (!is_swap_pte(pte)) > goto out; >@@ -240,6 +239,20 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > pte_unmap_unlock(ptep, ptl); > } > >+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, >+ unsigned long address) >+{ >+ spinlock_t *ptl = pte_lockptr(mm, pmd); >+ pte_t *ptep = pte_offset_map(pmd, address); >+ __migration_entry_wait(mm, ptep, ptl); >+} >+ >+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) >+{ >+ spinlock_t *ptl = huge_pte_lockptr(mm, pte); >+ __migration_entry_wait(mm, pte, ptl); >+} >+ > #ifdef CONFIG_BLOCK > /* Returns true if all buffers are successfully locked */ > static bool buffer_migrate_lock_buffers(struct buffer_head *head, >-- >1.7.11.7 > >-- >To unsubscribe, send a message with 'unsubscribe linux-mm' in >the body to majordomo@kvack.org. For more info on Linux MM, >see: http://www.linux-mm.org/ . >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi 2013-05-29 1:11 ` Wanpeng Li 2013-05-29 1:11 ` Wanpeng Li @ 2013-05-31 19:30 ` Andrew Morton 2013-05-31 19:46 ` Naoya Horiguchi 2013-06-03 13:26 ` Michal Hocko 3 siblings, 1 reply; 15+ messages in thread From: Andrew Morton @ 2013-05-31 19:30 UTC (permalink / raw) To: Naoya Horiguchi Cc: linux-mm, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel On Tue, 28 May 2013 15:52:51 -0400 Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> wrote: > When we have a page fault for the address which is backed by a hugepage > under migration, the kernel can't wait correctly and do busy looping on > hugepage fault until the migration finishes. > This is because pte_offset_map_lock() can't get a correct migration entry > or a correct page table lock for hugepage. > This patch introduces migration_entry_wait_huge() to solve this. > > Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" > entry with huge_pte_offset() inside which all the arch-dependency of > the page table structure are. So migration_entry_wait_huge() and > __migration_entry_wait() are free from arch-dependency. > > ChangeLog v3: > - use huge_pte_lockptr > > ChangeLog v2: > - remove dup in migrate_entry_wait_huge() > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Reviewed-by: Rik van Riel <riel@redhat.com> > Cc: stable@vger.kernel.org # 2.6.35 That changelog is not suitable for a -stable patch. People who maintain and utilize the stable trees need to know in some detail what is the end-user impact of the patch (or, equivalently, of the bug which the patch fixes). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-05-31 19:30 ` Andrew Morton @ 2013-05-31 19:46 ` Naoya Horiguchi 0 siblings, 0 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-05-31 19:46 UTC (permalink / raw) To: Andrew Morton Cc: linux-mm, Mel Gorman, Andi Kleen, Michal Hocko, KOSAKI Motohiro, Rik van Riel, linux-kernel > That changelog is not suitable for a -stable patch. People who > maintain and utilize the stable trees need to know in some detail what > is the end-user impact of the patch (or, equivalently, of the bug > which the patch fixes). OK. So, could you insert the following sentence? On Fri, May 31, 2013 at 12:30:03PM -0700, Andrew Morton wrote: > On Tue, 28 May 2013 15:52:51 -0400 Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> wrote: > > > When we have a page fault for the address which is backed by a hugepage > > under migration, the kernel can't wait correctly and do busy looping on > > hugepage fault until the migration finishes. As a result, users who try to kick hugepage migration (via soft offlining, for example) occasionally experience long delay or soft lockup. Thanks, Naoya > > This is because pte_offset_map_lock() can't get a correct migration entry > > or a correct page table lock for hugepage. > > This patch introduces migration_entry_wait_huge() to solve this. > > > > Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" > > entry with huge_pte_offset() inside which all the arch-dependency of > > the page table structure are. So migration_entry_wait_huge() and > > __migration_entry_wait() are free from arch-dependency. > > > > ChangeLog v3: > > - use huge_pte_lockptr > > > > ChangeLog v2: > > - remove dup in migrate_entry_wait_huge() > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > Reviewed-by: Rik van Riel <riel@redhat.com> > > Cc: stable@vger.kernel.org # 2.6.35 > > That changelog is not suitable for a -stable patch. People who > maintain and utilize the stable trees need to know in some detail what > is the end-user impact of the patch (or, equivalently, of the bug > which the patch fixes). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
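[Editor's note -- not part of the archived thread. A rough, self-contained illustration of the scenario the added changelog sentence refers to: userspace kicking hugepage migration via soft offlining while another task keeps touching the same hugepage. Assumes x86_64 with 2MB hugepages reserved (e.g. "echo 16 > /proc/sys/vm/nr_hugepages"), CONFIG_MEMORY_FAILURE=y, and root, since madvise(MADV_SOFT_OFFLINE) needs CAP_SYS_ADMIN. Whether the child's access actually lands in the migration window is timing-dependent; this only exercises the code path and is not a reproducer from the thread.]

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000
    #endif
    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE 101
    #endif

    #define HPAGE_SIZE (2UL * 1024 * 1024)

    int main(void)
    {
            char *p;
            pid_t pid;
            long i;

            p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap(MAP_HUGETLB)");
                    return 1;
            }
            memset(p, 'x', HPAGE_SIZE);     /* fault the hugepage in */

            pid = fork();
            if (pid == 0) {
                    /*
                     * child: keep reading the page; accesses that race with
                     * the migration fault into the migration-entry wait path
                     */
                    for (i = 0; i < 100000000L; i++)
                            (void)*(volatile char *)(p + (i % HPAGE_SIZE));
                    _exit(0);
            }

            /* parent: soft-offline the backing hugepage, forcing migration */
            if (madvise(p, HPAGE_SIZE, MADV_SOFT_OFFLINE))
                    perror("madvise(MADV_SOFT_OFFLINE)");

            waitpid(pid, NULL, 0);
            printf("still readable after migration: %c\n", p[0]);
            return 0;
    }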
* Re: [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi ` (2 preceding siblings ...) 2013-05-31 19:30 ` Andrew Morton @ 2013-06-03 13:26 ` Michal Hocko 2013-06-03 14:34 ` Naoya Horiguchi 2013-06-04 16:44 ` [PATCH v4] " Naoya Horiguchi 3 siblings, 2 replies; 15+ messages in thread From: Michal Hocko @ 2013-06-03 13:26 UTC (permalink / raw) To: Naoya Horiguchi Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel On Tue 28-05-13 15:52:51, Naoya Horiguchi wrote: > When we have a page fault for the address which is backed by a hugepage > under migration, the kernel can't wait correctly and do busy looping on > hugepage fault until the migration finishes. > This is because pte_offset_map_lock() can't get a correct migration entry > or a correct page table lock for hugepage. > This patch introduces migration_entry_wait_huge() to solve this. > > Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" > entry with huge_pte_offset() inside which all the arch-dependency of > the page table structure are. So migration_entry_wait_huge() and > __migration_entry_wait() are free from arch-dependency. > > ChangeLog v3: > - use huge_pte_lockptr > > ChangeLog v2: > - remove dup in migrate_entry_wait_huge() > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Reviewed-by: Rik van Riel <riel@redhat.com> > Cc: stable@vger.kernel.org # 2.6.35 OK, this looks good to me and I guess you can safely replace huge_pte_lockptr by &(mm)->page_table_lock so you can implement this even without risky 1/2 of this series. The patch should be as simple as possible especially when it goes to the stable. Without 1/2 dependency Reviewed-by: Michal Hocko <mhocko@suse.cz> > --- > include/linux/swapops.h | 3 +++ > mm/hugetlb.c | 2 +- > mm/migrate.c | 23 ++++++++++++++++++----- > 3 files changed, 22 insertions(+), 6 deletions(-) > > diff --git v3.10-rc3.orig/include/linux/swapops.h v3.10-rc3/include/linux/swapops.h > index 47ead51..c5fd30d 100644 > --- v3.10-rc3.orig/include/linux/swapops.h > +++ v3.10-rc3/include/linux/swapops.h > @@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) > > extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > unsigned long address); > +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); > #else > > #define make_migration_entry(page, write) swp_entry(0, 0) > @@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) > static inline void make_migration_entry_read(swp_entry_t *entryp) { } > static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > unsigned long address) { } > +static inline void migration_entry_wait_huge(struct mm_struct *mm, > + pte_t *pte) { } > static inline int is_write_migration_entry(swp_entry_t entry) > { > return 0; > diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c > index 8e1af32..d91a438 100644 > --- v3.10-rc3.orig/mm/hugetlb.c > +++ v3.10-rc3/mm/hugetlb.c > @@ -2877,7 +2877,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (ptep) { > entry = huge_ptep_get(ptep); > if (unlikely(is_hugetlb_entry_migration(entry))) { > - migration_entry_wait(mm, (pmd_t *)ptep, address); > + migration_entry_wait_huge(mm, ptep); > return 0; > } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) > return VM_FAULT_HWPOISON_LARGE | > diff --git v3.10-rc3.orig/mm/migrate.c v3.10-rc3/mm/migrate.c > 
index 6f2df6e..64ff118 100644 > --- v3.10-rc3.orig/mm/migrate.c > +++ v3.10-rc3/mm/migrate.c > @@ -204,15 +204,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) > * get to the page and wait until migration is finished. > * When we return from this function the fault will be retried. > */ > -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > - unsigned long address) > +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, > + spinlock_t *ptl) > { > - pte_t *ptep, pte; > - spinlock_t *ptl; > + pte_t pte; > swp_entry_t entry; > struct page *page; > > - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); > + spin_lock(ptl); > pte = *ptep; > if (!is_swap_pte(pte)) > goto out; > @@ -240,6 +239,20 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > pte_unmap_unlock(ptep, ptl); > } > > +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, > + unsigned long address) > +{ > + spinlock_t *ptl = pte_lockptr(mm, pmd); > + pte_t *ptep = pte_offset_map(pmd, address); > + __migration_entry_wait(mm, ptep, ptl); > +} > + > +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) > +{ > + spinlock_t *ptl = huge_pte_lockptr(mm, pte); > + __migration_entry_wait(mm, pte, ptl); > +} > + > #ifdef CONFIG_BLOCK > /* Returns true if all buffers are successfully locked */ > static bool buffer_migrate_lock_buffers(struct buffer_head *head, > -- > 1.7.11.7 > -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() 2013-06-03 13:26 ` Michal Hocko @ 2013-06-03 14:34 ` Naoya Horiguchi 2013-06-04 16:44 ` [PATCH v4] " Naoya Horiguchi 1 sibling, 0 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-06-03 14:34 UTC (permalink / raw) To: Michal Hocko Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel On Mon, Jun 03, 2013 at 03:26:41PM +0200, Michal Hocko wrote: > On Tue 28-05-13 15:52:51, Naoya Horiguchi wrote: > > When we have a page fault for the address which is backed by a hugepage > > under migration, the kernel can't wait correctly and do busy looping on > > hugepage fault until the migration finishes. > > This is because pte_offset_map_lock() can't get a correct migration entry > > or a correct page table lock for hugepage. > > This patch introduces migration_entry_wait_huge() to solve this. > > > > Note that the caller, hugetlb_fault(), gets the pointer to the "leaf" > > entry with huge_pte_offset() inside which all the arch-dependency of > > the page table structure are. So migration_entry_wait_huge() and > > __migration_entry_wait() are free from arch-dependency. > > > > ChangeLog v3: > > - use huge_pte_lockptr > > > > ChangeLog v2: > > - remove dup in migrate_entry_wait_huge() > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > Reviewed-by: Rik van Riel <riel@redhat.com> > > Cc: stable@vger.kernel.org # 2.6.35 > > OK, this looks good to me and I guess you can safely replace > huge_pte_lockptr by &(mm)->page_table_lock so you can implement this > even without risky 1/2 of this series. The patch should be as simple as > possible especially when it goes to the stable. Yes, I agree. > > Without 1/2 dependency > Reviewed-by: Michal Hocko <mhocko@suse.cz> Thanks, Naoya -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
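[Editor's note -- not part of the archived thread. A side-by-side of the v3 and v4 versions of migration_entry_wait_huge() from this thread, with editorial comments: with patch 1/2 applied, huge_pte_lockptr() resolves to the split lock of the page-table page backing the huge PTE; without it, every hugetlb path in v3.10 still serializes on mm->page_table_lock, so taking that lock directly is correct and drops the 1/2 dependency, which is what matters for a -stable backport.]

    /* v3, depends on patch 1/2: */
    void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
    {
            spinlock_t *ptl = huge_pte_lockptr(mm, pte);    /* split ptlock */
            __migration_entry_wait(mm, pte, ptl);
    }

    /* v4, no dependency, suitable for -stable: */
    void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
    {
            spinlock_t *ptl = &mm->page_table_lock;         /* lock hugetlb code already uses */
            __migration_entry_wait(mm, pte, ptl);
    }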
* [PATCH v4] migrate: add migrate_entry_wait_huge() 2013-06-03 13:26 ` Michal Hocko 2013-06-03 14:34 ` Naoya Horiguchi @ 2013-06-04 16:44 ` Naoya Horiguchi 1 sibling, 0 replies; 15+ messages in thread From: Naoya Horiguchi @ 2013-06-04 16:44 UTC (permalink / raw) To: Michal Hocko Cc: linux-mm, Andrew Morton, Mel Gorman, Andi Kleen, KOSAKI Motohiro, Rik van Riel, linux-kernel Here is the revised one. Andrew, could you replace the following patches in your tree with this? migrate-add-migrate_entry_wait_huge.patch hugetlbfs-support-split-page-table-lock.patch Thanks, Naoya Horiguchi ---- From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Date: Tue, 4 Jun 2013 12:26:32 -0400 Subject: [PATCH v4] migrate: add migrate_entry_wait_huge() When we have a page fault for the address which is backed by a hugepage under migration, the kernel can't wait correctly and do busy looping on hugepage fault until the migration finishes. As a result, users who try to kick hugepage migration (via soft offlining, for example) occasionally experience long delay or soft lockup. This is because pte_offset_map_lock() can't get a correct migration entry or a correct page table lock for hugepage. This patch introduces migration_entry_wait_huge() to solve this. ChangeLog v4: - replace huge_pte_lockptr with &(mm)->page_table_lock (postponed split page table lock patch) - update description ChangeLog v3: - use huge_pte_lockptr ChangeLog v2: - remove dup in migrate_entry_wait_huge() Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: stable@vger.kernel.org # 2.6.35 Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> --- include/linux/swapops.h | 3 +++ mm/hugetlb.c | 2 +- mm/migrate.c | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 47ead51..c5fd30d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } +static inline void migration_entry_wait_huge(struct mm_struct *mm, + pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 463fb5e..d793c5e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2865,7 +2865,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | diff --git a/mm/migrate.c b/mm/migrate.c index 6f2df6e..b8d56a1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -204,15 +204,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. 
*/ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -240,6 +239,20 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, -- 1.7.11.7 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2013-06-04 16:45 UTC | newest]

Thread overview: 15+ messages
  2013-05-28 19:52 [PATCH 0/2] hugetlbfs: support split page table lock Naoya Horiguchi
  2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi
  2013-05-29  1:09 ` Wanpeng Li
  2013-05-29  1:09 ` Wanpeng Li
  2013-06-03 13:19 ` Michal Hocko
  2013-06-03 14:34 ` Naoya Horiguchi
  2013-06-03 15:42 ` Michal Hocko
  2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi
  2013-05-29  1:11 ` Wanpeng Li
  2013-05-29  1:11 ` Wanpeng Li
  2013-05-31 19:30 ` Andrew Morton
  2013-05-31 19:46 ` Naoya Horiguchi
  2013-06-03 13:26 ` Michal Hocko
  2013-06-03 14:34 ` Naoya Horiguchi
  2013-06-04 16:44 ` [PATCH v4] " Naoya Horiguchi