All of lore.kernel.org
 help / color / mirror / Atom feed
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
To: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mgorman@suse.de>, Andi Kleen <andi@firstfloor.org>,
	Michal Hocko <mhocko@suse.cz>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Rik van Riel <riel@redhat.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/2] hugetlbfs: support split page table lock
Date: Wed, 29 May 2013 09:09:59 +0800	[thread overview]
Message-ID: <20130529010959.GA6530@hacker.(null)> (raw)
In-Reply-To: <1369770771-8447-2-git-send-email-n-horiguchi@ah.jp.nec.com>

On Tue, May 28, 2013 at 03:52:50PM -0400, Naoya Horiguchi wrote:
>Currently all of page table handling by hugetlbfs code are done under
>mm->page_table_lock. This is not optimal because there can be lock
>contentions between unrelated components using this lock.
>
>This patch makes hugepage support split page table lock so that
>we use page->ptl of the leaf node of page table tree which is pte for
>normal pages but can be pmd and/or pud for hugepages of some architectures.
>

Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>

>Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>---
> arch/x86/mm/hugetlbpage.c |  6 ++--
> include/linux/hugetlb.h   | 18 ++++++++++
> mm/hugetlb.c              | 84 ++++++++++++++++++++++++++++-------------------
> 3 files changed, 73 insertions(+), 35 deletions(-)
>
>diff --git v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c v3.10-rc3/arch/x86/mm/hugetlbpage.c
>index ae1aa71..0e4a396 100644
>--- v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c
>+++ v3.10-rc3/arch/x86/mm/hugetlbpage.c
>@@ -75,6 +75,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 	unsigned long saddr;
> 	pte_t *spte = NULL;
> 	pte_t *pte;
>+	spinlock_t *ptl;
>
> 	if (!vma_shareable(vma, addr))
> 		return (pte_t *)pmd_alloc(mm, pud, addr);
>@@ -89,6 +90,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 			spte = huge_pte_offset(svma->vm_mm, saddr);
> 			if (spte) {
> 				get_page(virt_to_page(spte));
>+				ptl = huge_pte_lockptr(mm, spte);
> 				break;
> 			}
> 		}
>@@ -97,12 +99,12 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 	if (!spte)
> 		goto out;
>
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	if (pud_none(*pud))
> 		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
> 	else
> 		put_page(virt_to_page(spte));
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> out:
> 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
> 	mutex_unlock(&mapping->i_mmap_mutex);
>diff --git v3.10-rc3.orig/include/linux/hugetlb.h v3.10-rc3/include/linux/hugetlb.h
>index a639c87..40f3215 100644
>--- v3.10-rc3.orig/include/linux/hugetlb.h
>+++ v3.10-rc3/include/linux/hugetlb.h
>@@ -32,6 +32,24 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
>
> int PageHuge(struct page *page);
>
>+#if USE_SPLIT_PTLOCKS
>+#define huge_pte_lockptr(mm, ptep) ({__pte_lockptr(virt_to_page(ptep)); })
>+#else	/* !USE_SPLIT_PTLOCKS */
>+#define huge_pte_lockptr(mm, ptep) ({&(mm)->page_table_lock; })
>+#endif	/* USE_SPLIT_PTLOCKS */
>+
>+#define huge_pte_offset_lock(mm, address, ptlp)		\
>+({							\
>+	pte_t *__pte = huge_pte_offset(mm, address);	\
>+	spinlock_t *__ptl = NULL;			\
>+	if (__pte) {					\
>+		__ptl = huge_pte_lockptr(mm, __pte);	\
>+		*(ptlp) = __ptl;			\
>+		spin_lock(__ptl);			\
>+	}						\
>+	__pte;						\
>+})
>+
> void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
>diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c
>index 463fb5e..8e1af32 100644
>--- v3.10-rc3.orig/mm/hugetlb.c
>+++ v3.10-rc3/mm/hugetlb.c
>@@ -2325,6 +2325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
>
> 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
>+		spinlock_t *srcptl, *dstptl;
> 		src_pte = huge_pte_offset(src, addr);
> 		if (!src_pte)
> 			continue;
>@@ -2336,8 +2337,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 		if (dst_pte == src_pte)
> 			continue;
>
>-		spin_lock(&dst->page_table_lock);
>-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
>+		dstptl = huge_pte_lockptr(dst, dst_pte);
>+		srcptl = huge_pte_lockptr(src, src_pte);
>+		spin_lock(dstptl);
>+		spin_lock_nested(srcptl, SINGLE_DEPTH_NESTING);
> 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
> 			if (cow)
> 				huge_ptep_set_wrprotect(src, addr, src_pte);
>@@ -2347,8 +2350,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 			page_dup_rmap(ptepage);
> 			set_huge_pte_at(dst, addr, dst_pte, entry);
> 		}
>-		spin_unlock(&src->page_table_lock);
>-		spin_unlock(&dst->page_table_lock);
>+		spin_unlock(srcptl);
>+		spin_unlock(dstptl);
> 	}
> 	return 0;
>
>@@ -2391,6 +2394,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 	unsigned long address;
> 	pte_t *ptep;
> 	pte_t pte;
>+	spinlock_t *ptl;
> 	struct page *page;
> 	struct hstate *h = hstate_vma(vma);
> 	unsigned long sz = huge_page_size(h);
>@@ -2404,25 +2408,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 	tlb_start_vma(tlb, vma);
> 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> again:
>-	spin_lock(&mm->page_table_lock);
> 	for (address = start; address < end; address += sz) {
>-		ptep = huge_pte_offset(mm, address);
>+		ptep = huge_pte_offset_lock(mm, address, &ptl);
> 		if (!ptep)
> 			continue;
>
> 		if (huge_pmd_unshare(mm, &address, ptep))
>-			continue;
>+			goto unlock;
>
> 		pte = huge_ptep_get(ptep);
> 		if (huge_pte_none(pte))
>-			continue;
>+			goto unlock;
>
> 		/*
> 		 * HWPoisoned hugepage is already unmapped and dropped reference
> 		 */
> 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
> 			huge_pte_clear(mm, address, ptep);
>-			continue;
>+			goto unlock;
> 		}
>
> 		page = pte_page(pte);
>@@ -2433,7 +2436,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 		 */
> 		if (ref_page) {
> 			if (page != ref_page)
>-				continue;
>+				goto unlock;
>
> 			/*
> 			 * Mark the VMA as having unmapped its page so that
>@@ -2450,13 +2453,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
>
> 		page_remove_rmap(page);
> 		force_flush = !__tlb_remove_page(tlb, page);
>-		if (force_flush)
>+		if (force_flush) {
>+			spin_unlock(ptl);
> 			break;
>+		}
> 		/* Bail out after unmapping reference page if supplied */
>-		if (ref_page)
>+		if (ref_page) {
>+			spin_unlock(ptl);
> 			break;
>+		}
>+unlock:
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	/*
> 	 * mmu_gather ran out of room to batch pages, we break out of
> 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
>@@ -2570,6 +2578,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	int outside_reserve = 0;
> 	unsigned long mmun_start;	/* For mmu_notifiers */
> 	unsigned long mmun_end;		/* For mmu_notifiers */
>+	spinlock_t *ptl = huge_pte_lockptr(mm, ptep);
>
> 	old_page = pte_page(pte);
>
>@@ -2601,7 +2610,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	page_cache_get(old_page);
>
> 	/* Drop page_table_lock as buddy allocator may be called */
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	new_page = alloc_huge_page(vma, address, outside_reserve);
>
> 	if (IS_ERR(new_page)) {
>@@ -2619,7 +2628,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 			BUG_ON(huge_pte_none(pte));
> 			if (unmap_ref_private(mm, vma, old_page, address)) {
> 				BUG_ON(huge_pte_none(pte));
>-				spin_lock(&mm->page_table_lock);
>+				spin_lock(ptl);
> 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
> 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
> 					goto retry_avoidcopy;
>@@ -2633,7 +2642,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		}
>
> 		/* Caller expects lock to be held */
>-		spin_lock(&mm->page_table_lock);
>+		spin_lock(ptl);
> 		if (err == -ENOMEM)
> 			return VM_FAULT_OOM;
> 		else
>@@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		page_cache_release(new_page);
> 		page_cache_release(old_page);
> 		/* Caller expects lock to be held */
>-		spin_lock(&mm->page_table_lock);
>+		spin_lock(ptl);
> 		return VM_FAULT_OOM;
> 	}
>
>@@ -2663,7 +2672,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	 * Retake the page_table_lock to check for racing updates
> 	 * before the page tables are altered
> 	 */
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
> 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
> 		/* Break COW */
>@@ -2675,10 +2684,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		/* Make the old page be freed below */
> 		new_page = old_page;
> 	}
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> 	/* Caller expects lock to be held */
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	page_cache_release(new_page);
> 	page_cache_release(old_page);
> 	return 0;
>@@ -2728,6 +2737,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 	struct page *page;
> 	struct address_space *mapping;
> 	pte_t new_pte;
>+	spinlock_t *ptl;
>
> 	/*
> 	 * Currently, we are forced to kill the process in the event the
>@@ -2813,7 +2823,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 			goto backout_unlocked;
> 		}
>
>-	spin_lock(&mm->page_table_lock);
>+	ptl = huge_pte_lockptr(mm, ptep);
>+	spin_lock(ptl);
> 	size = i_size_read(mapping->host) >> huge_page_shift(h);
> 	if (idx >= size)
> 		goto backout;
>@@ -2835,13 +2846,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
> 	}
>
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	unlock_page(page);
> out:
> 	return ret;
>
> backout:
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> backout_unlocked:
> 	unlock_page(page);
> 	put_page(page);
>@@ -2853,6 +2864,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> 	pte_t *ptep;
> 	pte_t entry;
>+	spinlock_t *ptl;
> 	int ret;
> 	struct page *page = NULL;
> 	struct page *pagecache_page = NULL;
>@@ -2921,7 +2933,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 	if (page != pagecache_page)
> 		lock_page(page);
>
>-	spin_lock(&mm->page_table_lock);
>+	ptl = huge_pte_lockptr(mm, ptep);
>+	spin_lock(ptl);
> 	/* Check for a racing update before calling hugetlb_cow */
> 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
> 		goto out_page_table_lock;
>@@ -2941,7 +2954,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 		update_mmu_cache(vma, address, ptep);
>
> out_page_table_lock:
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
>
> 	if (pagecache_page) {
> 		unlock_page(pagecache_page);
>@@ -2976,9 +2989,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 	unsigned long remainder = *nr_pages;
> 	struct hstate *h = hstate_vma(vma);
>
>-	spin_lock(&mm->page_table_lock);
> 	while (vaddr < vma->vm_end && remainder) {
> 		pte_t *pte;
>+		spinlock_t *ptl = NULL;
> 		int absent;
> 		struct page *page;
>
>@@ -2986,8 +2999,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		 * Some archs (sparc64, sh*) have multiple pte_ts to
> 		 * each hugepage.  We have to make sure we get the
> 		 * first, for the page indexing below to work.
>+		 *
>+		 * Note that page table lock is not held when pte is null.
> 		 */
>-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
>+		pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl);
> 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
>
> 		/*
>@@ -2999,6 +3014,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		 */
> 		if (absent && (flags & FOLL_DUMP) &&
> 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
>+			if (pte)
>+				spin_unlock(ptl);
> 			remainder = 0;
> 			break;
> 		}
>@@ -3018,10 +3035,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		      !huge_pte_write(huge_ptep_get(pte)))) {
> 			int ret;
>
>-			spin_unlock(&mm->page_table_lock);
>+			if (pte)
>+				spin_unlock(ptl);
> 			ret = hugetlb_fault(mm, vma, vaddr,
> 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
>-			spin_lock(&mm->page_table_lock);
> 			if (!(ret & VM_FAULT_ERROR))
> 				continue;
>
>@@ -3052,8 +3069,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 			 */
> 			goto same_page;
> 		}
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	*nr_pages = remainder;
> 	*position = vaddr;
>
>@@ -3074,13 +3091,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> 	flush_cache_range(vma, address, end);
>
> 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
>-	spin_lock(&mm->page_table_lock);
> 	for (; address < end; address += huge_page_size(h)) {
>-		ptep = huge_pte_offset(mm, address);
>+		spinlock_t *ptl;
>+		ptep = huge_pte_offset_lock(mm, address, &ptl);
> 		if (!ptep)
> 			continue;
> 		if (huge_pmd_unshare(mm, &address, ptep)) {
> 			pages++;
>+			spin_unlock(ptl);
> 			continue;
> 		}
> 		if (!huge_pte_none(huge_ptep_get(ptep))) {
>@@ -3090,8 +3108,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> 			set_huge_pte_at(mm, address, ptep, pte);
> 			pages++;
> 		}
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	/*
> 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
> 	 * may have cleared our pud entry and done put_page on the page table:
>-- 
>1.7.11.7
>
>--
>To unsubscribe, send a message with 'unsubscribe linux-mm' in
>the body to majordomo@kvack.org.  For more info on Linux MM,
>see: http://www.linux-mm.org/ .
>Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2013-05-29  1:10 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-05-28 19:52 [PATCH 0/2] hugetlbfs: support split page table lock Naoya Horiguchi
2013-05-28 19:52 ` Naoya Horiguchi
2013-05-28 19:52 ` [PATCH 1/2] " Naoya Horiguchi
2013-05-28 19:52   ` Naoya Horiguchi
2013-05-29  1:09   ` Wanpeng Li
2013-05-29  1:09   ` Wanpeng Li [this message]
2013-06-03 13:19   ` Michal Hocko
2013-06-03 13:19     ` Michal Hocko
2013-06-03 14:34     ` Naoya Horiguchi
2013-06-03 14:34       ` Naoya Horiguchi
2013-06-03 15:42       ` Michal Hocko
2013-06-03 15:42         ` Michal Hocko
2013-05-28 19:52 ` [PATCH v3 2/2] migrate: add migrate_entry_wait_huge() Naoya Horiguchi
2013-05-28 19:52   ` Naoya Horiguchi
2013-05-29  1:11   ` Wanpeng Li
2013-05-29  1:11   ` Wanpeng Li
2013-05-31 19:30   ` Andrew Morton
2013-05-31 19:30     ` Andrew Morton
2013-05-31 19:46     ` Naoya Horiguchi
2013-05-31 19:46       ` Naoya Horiguchi
2013-06-03 13:26   ` Michal Hocko
2013-06-03 13:26     ` Michal Hocko
2013-06-03 14:34     ` Naoya Horiguchi
2013-06-03 14:34       ` Naoya Horiguchi
2013-06-04 16:44     ` [PATCH v4] " Naoya Horiguchi
2013-06-04 16:44       ` Naoya Horiguchi
  -- strict thread matches above, loose matches on Subject: below --
2013-08-30 17:18 [PATCH 0/2 v2] split page table lock for hugepage Naoya Horiguchi
2013-08-30 17:18 ` [PATCH 1/2] hugetlbfs: support split page table lock Naoya Horiguchi
2013-08-30 17:18   ` Naoya Horiguchi
2013-09-04  7:13   ` Aneesh Kumar K.V
2013-09-04  7:13     ` Aneesh Kumar K.V
2013-09-04 16:32     ` Naoya Horiguchi
2013-09-04 16:32       ` Naoya Horiguchi
2013-09-05  9:18       ` Aneesh Kumar K.V
2013-09-05  9:18         ` Aneesh Kumar K.V
2013-09-05 15:23         ` Naoya Horiguchi
2013-09-05 15:23           ` Naoya Horiguchi
2013-09-05 21:27 [PATCH 0/2 v3] split page table lock for hugepage Naoya Horiguchi
2013-09-05 21:27 ` [PATCH 1/2] hugetlbfs: support split page table lock Naoya Horiguchi
2013-09-05 21:27   ` Naoya Horiguchi
2013-09-08 16:53   ` Aneesh Kumar K.V
2013-09-08 16:53     ` Aneesh Kumar K.V
2013-09-09 16:26     ` Naoya Horiguchi
2013-09-09 16:26       ` Naoya Horiguchi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='20130529010959.GA6530@hacker.(null)' \
    --to=liwanp@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.cz \
    --cc=n-horiguchi@ah.jp.nec.com \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.