[rfc] more granular page table lock for hugepages

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [rfc] more granular page table lock for hugepages
@ 2007-10-08 22:52 Siddha, Suresh B
  2007-10-09 20:23 ` Ken Chen
  0 siblings, 1 reply; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-08 22:52 UTC (permalink / raw)
  To: linux-mm

Appended patch is a quick prototype which extends the concept of separate
spinlock per page table page to hugepages. More granular spinlock will
be used to guard the page table entries in the pmd page, instead of using the
mm's single page_table_lock.

For the threaded OLTP workload, this patch showed a 2.4% througput
improvement on a 128GB x86_64 system.

Appended patch is for i386/x86_64 and need more work to make it generic
for all architectures.

Note: To make use of this optimization, pmd page table page need to be
allocated using regular page allocation routines and not through slab cache
(As the spinlock in struct page overlaps with the slab meta-data).
For example, powerpc allocates pmd through the slab cache. Perhaps
we need to change the pmd allocation in powerpc or use the
global page_table_lock for now.

Before we clean it up and make it generic enough to cover all the
architectures supporting hugepages, wanted to run this by the experts
in linux-mm.

Comments?

thanks,
suresh
---
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index efdf95a..1d2d3be 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -117,7 +117,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	if (page_count(virt_to_page(ptep)) == 1)
 		return 0;
 
+	spin_lock(&mm->page_table_lock);
 	pud_clear(pud);
+	spin_unlock(&mm->page_table_lock);
 	put_page(virt_to_page(ptep));
 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
 	return 1;
@@ -134,7 +136,37 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	if (pud) {
 		if (pud_none(*pud))
 			huge_pmd_share(mm, addr, pud);
-		pte = (pte_t *) pmd_alloc(mm, pud, addr);
+		if (pud_none(*pud)) {
+			pte = (pte_t *) pmd_alloc(mm, pud, addr);
+			pte_lock_init(virt_to_page(pte));
+		} else
+			pte = (pte_t *) pmd_offset(pud, addr);
+	}
+	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+	return pte;
+}
+
+pte_t *huge_pte_alloc_lock(struct mm_struct *mm, unsigned long addr, spinlock_t **ptlp)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		spinlock_t *ptl;
+		if (pud_none(*pud))
+			huge_pmd_share(mm, addr, pud);
+		if (pud_none(*pud)) {
+			pte = (pte_t *) pmd_alloc(mm, pud, addr);
+			pte_lock_init(virt_to_page(pte));
+		} else
+			pte = (pte_t *) pmd_offset(pud, addr);
+		ptl = pte_lockptr(mm, (pmd_t *)pud);
+		*ptlp = ptl;
+		spin_lock(ptl);
 	}
 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
@@ -156,6 +188,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return (pte_t *) pmd;
 }
 
+pte_t *huge_pte_offset_lock(struct mm_struct *mm, unsigned long addr, spinlock_t **ptlp)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_present(*pgd)) {
+		pud = pud_offset(pgd, addr);
+		if (pud_present(*pud)) {
+			spinlock_t *ptl = pte_lockptr(mm, (pmd_t *)pud);
+			*ptlp = ptl;
+			pmd = pmd_offset(pud, addr);
+			spin_lock(ptl);
+		}
+	}
+	return (pte_t *) pmd;
+}
+
 #if 0	/* This is just for testing */
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 0a71e0b..c6271f3 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -345,7 +345,7 @@ static inline int pmd_large(pmd_t pte) {
 
 /* PMD  - Level 2 access */
 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
-#define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+#define pmd_page(pmd)		(pfn_to_page(pmd_pfn(pmd)))
 
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 #define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2c13715..3abcc3f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,7 +35,9 @@ extern int sysctl_hugetlb_shm_group;
 /* arch callbacks */
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
+pte_t *huge_pte_alloc_lock(struct mm_struct *mm, unsigned long addr, spinlock_t **ptlp);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+pte_t *huge_pte_offset_lock(struct mm_struct *mm, unsigned long addr, spinlock_t **ptlp);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a45d1f0..b1d27e7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -344,14 +344,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		spinlock_t *sptl;
+		spinlock_t *dptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
-		dst_pte = huge_pte_alloc(dst, addr);
+		dst_pte = huge_pte_alloc_lock(dst, addr, &dptl);
 		if (!dst_pte)
 			goto nomem;
-		spin_lock(&dst->page_table_lock);
-		spin_lock(&src->page_table_lock);
+		if (src_pte != dst_pte)
+			huge_pte_offset_lock(src, addr, &sptl);
 		if (!pte_none(*src_pte)) {
 			if (cow)
 				ptep_set_wrprotect(src, addr, src_pte);
@@ -360,8 +362,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			get_page(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		if (src_pte != dst_pte)
+			spin_unlock(sptl);
+		spin_unlock(dptl);
 	}
 	return 0;
 
@@ -378,6 +381,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	spinlock_t *ptl;
+
 	/*
 	 * A page gathering list, protected by per file i_mmap_lock. The
 	 * lock is used to avoid list corruption from multiple unmapping
@@ -389,7 +394,6 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
@@ -398,7 +402,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
 
+		ptep = huge_pte_offset_lock(mm, address, &ptl);
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+
+		spin_unlock(ptl);
+
 		if (pte_none(pte))
 			continue;
 
@@ -407,7 +415,6 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			set_page_dirty(page);
 		list_add(&page->lru, &page_list);
 	}
-	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
@@ -438,6 +445,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
+	spinlock_t *ptl;
 
 	old_page = pte_page(pte);
 
@@ -457,11 +465,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
-	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
-	spin_lock(&mm->page_table_lock);
 
-	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	ptep = huge_pte_offset_lock(mm, address & HPAGE_MASK, &ptl);
 	if (likely(pte_same(*ptep, pte))) {
 		/* Break COW */
 		set_huge_pte_at(mm, address, ptep,
@@ -471,6 +477,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	page_cache_release(new_page);
 	page_cache_release(old_page);
+	spin_unlock(ptl);
 	return VM_FAULT_MINOR;
 }
 
@@ -483,6 +490,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -523,31 +531,32 @@ retry:
 			lock_page(page);
 	}
 
-	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto backout;
 
 	ret = VM_FAULT_MINOR;
-	if (!pte_none(*ptep))
+	huge_pte_offset_lock(mm, address, &ptl);
+	if (!pte_none(*ptep)) {
+		spin_unlock(ptl);
 		goto backout;
+	}
 
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
 
+	spin_unlock(ptl);
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
 	}
 
-	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
 	hugetlb_put_quota(mapping);
 	unlock_page(page);
 	put_page(page);
@@ -561,6 +570,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+	spinlock_t *ptl;
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
@@ -572,8 +582,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * the same page in the page cache.
 	 */
 	mutex_lock(&hugetlb_instantiation_mutex);
+	ptep = huge_pte_offset_lock(mm, address, &ptl);
 	entry = *ptep;
 	if (pte_none(entry)) {
+		spin_unlock(ptl);
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
 		mutex_unlock(&hugetlb_instantiation_mutex);
 		return ret;
@@ -581,12 +593,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ret = VM_FAULT_MINOR;
 
-	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
-	if (likely(pte_same(entry, *ptep)))
-		if (write_access && !pte_write(entry))
+	if (likely(pte_same(entry, *ptep))) {
+		if (write_access && !pte_write(entry)) {
+			spin_unlock(ptl);
 			ret = hugetlb_cow(mm, vma, address, ptep, entry);
-	spin_unlock(&mm->page_table_lock);
+		} else
+			spin_unlock(ptl);
+	} else
+		spin_unlock(ptl);
+
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
@@ -599,8 +615,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
+	spinlock_t *ptl;
 
-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
 		struct page *page;
@@ -610,14 +626,14 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * each hugepage.  We have to make * sure we get the
 		 * first, for the page indexing below to work.
 		 */
-		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+		pte = huge_pte_offset_lock(mm, vaddr & HPAGE_MASK, &ptl);
 
 		if (!pte || pte_none(*pte)) {
 			int ret;
 
-			spin_unlock(&mm->page_table_lock);
+			if (pte)
+				spin_unlock(ptl);
 			ret = hugetlb_fault(mm, vma, vaddr, 0);
-			spin_lock(&mm->page_table_lock);
 			if (ret == VM_FAULT_MINOR)
 				continue;
 
@@ -650,8 +666,8 @@ same_page:
 			 */
 			goto same_page;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
 	*position = vaddr;
 
@@ -670,21 +686,23 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
+		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
+
+		ptep = huge_pte_offset_lock(mm, address, &ptl);
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
 			set_huge_pte_at(mm, address, ptep, pte);
 			lazy_mmu_prot_update(pte);
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
diff --git a/mm/memory.c b/mm/memory.c
index f64cbf9..1bde136 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2645,6 +2645,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	pte_lock_init(virt_to_page(new));
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-08 22:52 [rfc] more granular page table lock for hugepages Siddha, Suresh B
@ 2007-10-09 20:23 ` Ken Chen
  2007-10-09 21:05   ` Badari Pulavarty
  0 siblings, 1 reply; 30+ messages in thread
From: Ken Chen @ 2007-10-09 20:23 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: linux-mm

On 10/8/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> Appended patch is a quick prototype which extends the concept of separate
> spinlock per page table page to hugepages. More granular spinlock will
> be used to guard the page table entries in the pmd page, instead of using the
> mm's single page_table_lock.

What path do you content on mm->page_table_lock?

The major fault for hugetlb page is blanket by
hugetlb_instantiation_mutex.  So likelihood of contention on
page_table spin lock is low.  For minor fault, I would think
mapping->i_mmap_lock will kick in before page table lock.  That left
follow_hugetlb_page path.  Is it the case?

Also are you contending within hugetlb regions, or contenting with
other vma regions?

- Ken

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-09 20:23 ` Ken Chen
@ 2007-10-09 21:05   ` Badari Pulavarty
  2007-10-10  0:15     ` Siddha, Suresh B
  0 siblings, 1 reply; 30+ messages in thread
From: Badari Pulavarty @ 2007-10-09 21:05 UTC (permalink / raw)
  To: Ken Chen; +Cc: Siddha, Suresh B, linux-mm

On Tue, 2007-10-09 at 13:23 -0700, Ken Chen wrote:
> On 10/8/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> > Appended patch is a quick prototype which extends the concept of separate
> > spinlock per page table page to hugepages. More granular spinlock will
> > be used to guard the page table entries in the pmd page, instead of using the
> > mm's single page_table_lock.
> 
> What path do you content on mm->page_table_lock?
> 
> The major fault for hugetlb page is blanket by
> hugetlb_instantiation_mutex.  So likelihood of contention on
> page_table spin lock is low.  For minor fault, I would think
> mapping->i_mmap_lock will kick in before page table lock.  That left
> follow_hugetlb_page path.  Is it the case?

Yes. follow_hugetlb_page() is where our benchmark team has seen
contention with threaded workload.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-09 21:05   ` Badari Pulavarty
@ 2007-10-10  0:15     ` Siddha, Suresh B
  2007-10-10  6:10       ` Ken Chen
  0 siblings, 1 reply; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-10  0:15 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Ken Chen, Siddha, Suresh B, linux-mm

On Tue, Oct 09, 2007 at 02:05:57PM -0700, Badari Pulavarty wrote:
> On Tue, 2007-10-09 at 13:23 -0700, Ken Chen wrote:
> > On 10/8/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> > > Appended patch is a quick prototype which extends the concept of separate
> > > spinlock per page table page to hugepages. More granular spinlock will
> > > be used to guard the page table entries in the pmd page, instead of using the
> > > mm's single page_table_lock.
> > 
> > What path do you content on mm->page_table_lock?
> > 
> > The major fault for hugetlb page is blanket by
> > hugetlb_instantiation_mutex.  So likelihood of contention on
> > page_table spin lock is low.  For minor fault, I would think
> > mapping->i_mmap_lock will kick in before page table lock.  That left
> > follow_hugetlb_page path.  Is it the case?
> 
> Yes. follow_hugetlb_page() is where our benchmark team has seen
> contention with threaded workload.

That's correct. And the direct IO leading to those calls.

thanks,
suresh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-10  0:15     ` Siddha, Suresh B
@ 2007-10-10  6:10       ` Ken Chen
  2007-10-10  7:50         ` Ken Chen
  0 siblings, 1 reply; 30+ messages in thread
From: Ken Chen @ 2007-10-10  6:10 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Badari Pulavarty, linux-mm

On 10/9/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> > Yes. follow_hugetlb_page() is where our benchmark team has seen
> > contention with threaded workload.
>
> That's correct. And the direct IO leading to those calls.

That's what I figures.  In that case, why don't we get rid of all spin
lock in the fast path of follow_hugetlb_pages.

follow_hugetlb_page is called from get_user_pages, which should
already hold mm->mmap_sem in read mode.  That means page table tear
down can not happen.  We do a racy read on page table chain.  If a
race happened with another thread, no big deal, it will just fall into
hugetlb_fault() which will then serialize with
hugetlb_instantiation_mutex or mm->page_table_lock.  And that's slow
path anyway.

- Ken

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-10  6:10       ` Ken Chen
@ 2007-10-10  7:50         ` Ken Chen
  2007-10-11 11:39           ` Nick Piggin
  0 siblings, 1 reply; 30+ messages in thread
From: Ken Chen @ 2007-10-10  7:50 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Badari Pulavarty, linux-mm

On 10/9/07, Ken Chen <kenchen@google.com> wrote:
> That's what I figures.  In that case, why don't we get rid of all spin
> lock in the fast path of follow_hugetlb_pages.
>
> follow_hugetlb_page is called from get_user_pages, which should
> already hold mm->mmap_sem in read mode.  That means page table tear
> down can not happen.  We do a racy read on page table chain.  If a
> race happened with another thread, no big deal, it will just fall into
> hugetlb_fault() which will then serialize with
> hugetlb_instantiation_mutex or mm->page_table_lock.  And that's slow
> path anyway.

never mind.  ftruncate can come through in another path removes
mapping without holding mm->mmap_sem.  So much for the crazy idea.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-10  7:50         ` Ken Chen
@ 2007-10-11 11:39           ` Nick Piggin
  2007-10-12 20:34             ` Siddha, Suresh B
  0 siblings, 1 reply; 30+ messages in thread
From: Nick Piggin @ 2007-10-11 11:39 UTC (permalink / raw)
  To: Ken Chen; +Cc: Siddha, Suresh B, Badari Pulavarty, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1952 bytes --]

On Wednesday 10 October 2007 17:50, Ken Chen wrote:
> On 10/9/07, Ken Chen <kenchen@google.com> wrote:
> > That's what I figures.  In that case, why don't we get rid of all spin
> > lock in the fast path of follow_hugetlb_pages.
> >
> > follow_hugetlb_page is called from get_user_pages, which should
> > already hold mm->mmap_sem in read mode.  That means page table tear
> > down can not happen.  We do a racy read on page table chain.  If a
> > race happened with another thread, no big deal, it will just fall into
> > hugetlb_fault() which will then serialize with
> > hugetlb_instantiation_mutex or mm->page_table_lock.  And that's slow
> > path anyway.
>
> never mind.  ftruncate can come through in another path removes
> mapping without holding mm->mmap_sem.  So much for the crazy idea.

Yeah, that's a killer...

Here is another crazy idea I've been mulling around. I was on
the brink of forgetting the whole thing until Suresh just now
showed how much performance there is to be had.

I don't suppose the mmap_sem avoidance from this patch matters
so much if your database isn't using threads. But at least it
should be faster (unless my crazy idea has some huge hole, and
provided hugepages are implemented).

Basic idea is that architectures can override get_user_pages.
Or at least, a fast if not complete version and subsequently
fall back to regular get_user_pages if it encounters something
difficult (eg. a swapped out page).

I *think* we can do this for x86-64 without taking mmap_sem, or
_any_ page table locks at all. Obviously the CPUs themselves do
a very similar lockless lookup for TLB fill.

[ We actually might even be able to go one better if we could have
  virt->phys instructions in the CPU that would lookup and even
  fill the TLB for us. I don't know what the chances of that
  happening are, Suresh ;) ]

Attached is the really basic sketch of how it will work. Any
party poopers care tell me why I'm an idiot? :)


[-- Attachment #2: mm-get_user_pages-fast.patch --]
[-- Type: text/x-diff, Size: 3280 bytes --]

Index: linux-2.6/arch/x86/lib/Makefile_64
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile_64
+++ linux-2.6/arch/x86/lib/Makefile_64
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup.o
Index: linux-2.6/arch/x86/lib/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/lib/gup.c
@@ -0,0 +1,99 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(struct mm_struct *mm, pmd_t pmd, unsigned long addr, unsigned long end, struct page **pages, int *nr, int write)
+{
+	pte_t *ptep;
+
+	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
+	do {
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return 0;
+
+		if (write && !pte_write(pte))
+			return 0;
+
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep);
+
+	return 1;
+}
+
+static int gup_pmd_range(struct mm_struct *mm, pud_t pud, unsigned long addr, unsigned long end, struct page **pages, int *nr, int write)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		/* if (pte_huge(pmd)) {...} */
+		if (!gup_pte_range(mm, pmd, addr, next, pages, nr, write))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static unsigned long gup_pud_range(struct mm_struct *mm, pgd_t pgd, unsigned long addr, unsigned long end, struct page **pages, int *nr, int write)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(mm, pud, addr, next, pages, nr, write))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long addr, unsigned long end, int flags, struct page **pages, int nr, int write)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long next;
+	pgd_t *pgdp;
+
+	/* XXX: batch / limit 'nr', to avoid huge latency */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on x86-64. XXX: hugepages!
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86-64), we can follow the address down to the
+	 * the page.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(mm, pgd, addr, next, pages, &nr, write))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	return nr;
+}

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-11 11:39           ` Nick Piggin
@ 2007-10-12 20:34             ` Siddha, Suresh B
  2007-10-13 23:27               ` Nick Piggin
  0 siblings, 1 reply; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-12 20:34 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ken Chen, Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

On Thu, Oct 11, 2007 at 04:39:51AM -0700, Nick Piggin wrote:
> Attached is the really basic sketch of how it will work. Any
> party poopers care tell me why I'm an idiot? :)

I tried to be a party pooper but no. This sounds like a good idea as you
are banking on the 'mm' being the 'active mm'.

sounds like two birds in one shot, I think.

On ia64, we have "tpa" instruction which does the virtual to physical
address conversion for us. But talking to Tony, that will fault during not
present or vhpt misses.

Well, for now, manual walk is probably the best we have.

thanks,
suresh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-12 20:34             ` Siddha, Suresh B
@ 2007-10-13 23:27               ` Nick Piggin
  2007-10-14  1:01                 ` [rfc] lockless get_user_pages for dio (and more) Nick Piggin
  2007-10-14 15:42                 ` [rfc] more granular page table lock for hugepages Siddha, Suresh B
  0 siblings, 2 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-13 23:27 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Ken Chen, Badari Pulavarty, linux-mm, tony.luck

On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> On Thu, Oct 11, 2007 at 04:39:51AM -0700, Nick Piggin wrote:
> > Attached is the really basic sketch of how it will work. Any
> > party poopers care tell me why I'm an idiot? :)
>
> I tried to be a party pooper but no. This sounds like a good idea as you
> are banking on the 'mm' being the 'active mm'.

Yeah, I think that's the common case, and definitely required for
this lockless path to work.


> sounds like two birds in one shot, I think.

OK, I'll flesh it out a bit more and see if I can actually get
something working (and working with hugepages too).


> On ia64, we have "tpa" instruction which does the virtual to physical
> address conversion for us. But talking to Tony, that will fault during not
> present or vhpt misses.
>
> Well, for now, manual walk is probably the best we have.

Hmm, we'd actually want it to fault, and go through the full
handle_mm_fault path if possible, and somehow just give an
-EFAULT if it can't be satisfied. The common case will be that
a mapping does actually exist, but sometimes there won't be a
pte entry... depending on the application, it may even be the
common case to have a hot TLB entry too... I don't know,
obviously the manual walk is needed to get a simple baseline.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [rfc] lockless get_user_pages for dio (and more)
  2007-10-13 23:27               ` Nick Piggin
@ 2007-10-14  1:01                 ` Nick Piggin
  2007-10-14 18:19                   ` Siddha, Suresh B
  2007-10-14 15:42                 ` [rfc] more granular page table lock for hugepages Siddha, Suresh B
  1 sibling, 1 reply; 30+ messages in thread
From: Nick Piggin @ 2007-10-14  1:01 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Ken Chen, Badari Pulavarty, linux-mm, tony.luck

[-- Attachment #1: Type: text/plain, Size: 629 bytes --]

On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:

> > sounds like two birds in one shot, I think.
>
> OK, I'll flesh it out a bit more and see if I can actually get
> something working (and working with hugepages too).

This is just a really quick hack, untested ATM, but one that
has at least a chance of working (on x86).

I don't know if I've got the hugepage walk exactly right,
because I've never really done much practical work on that
side of things.

Hmm, I guess we also want some instrumentation to ensure that
we aren't often dropping into the slowpath.

[-- Attachment #2: mm-get_user_pages-fast.patch --]
[-- Type: text/x-diff, Size: 9917 bytes --]

Introduce a new "fast_gup" (for want of a better name right now) which
is basically a get_user_pages with a less general API that is more suited
to the common case.

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This allows (at least on x86), an optimistic lockless pagetable walk,
without taking any page table locks or even mmap_sem. Page table existence
is guaranteed by turning interrupts off (combined with the fact that we're
always looking up the current mm, which would need an IPI before its
pagetables could be shot down from another CPU).

Many other architectures could do the same thing. Those that don't IPI
could potentially RCU free the page tables and do speculative references
on the pages (a la lockless pagecache) to achieve a lockless fast_gup.


---
Index: linux-2.6/arch/x86/lib/Makefile_64
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile_64
+++ linux-2.6/arch/x86/lib/Makefile_64
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup.o
Index: linux-2.6/arch/x86/lib/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/lib/gup.c
@@ -0,0 +1,144 @@
+/*
+ * Lockless fast_gup for x86
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+
+	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
+	do {
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return 0;
+
+		if (write && !pte_write(pte))
+			return 0;
+
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep);
+
+	return 1;
+}
+
+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pmd;
+
+	if (write && !pte_write(pte))
+		return 0;
+
+	do {
+		unsigned long pfn_offset;
+		struct page *page;
+
+		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(pte) + pfn_offset;
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (pmd_large(pmd))
+			gup_huge_pmd(pmd, addr, next, write, pages, nr);
+		else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long start, unsigned long end, int write,
+		struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/* XXX: batch / limit 'nr', to avoid huge latency */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on x86-64. XXX: hugepages!
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86-64), we can follow the address down to the
+	 * the page.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+slow:
+	{
+		int ret;
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start, (end - start) >> PAGE_SHIFT,
+					write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+		return ret;
+	}
+}
Index: linux-2.6/include/asm-x86/uaccess_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/uaccess_64.h
+++ linux-2.6/include/asm-x86/uaccess_64.h
@@ -381,4 +381,8 @@ static inline int __copy_from_user_inato
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+struct page;
+int fast_gup(unsigned long start, unsigned long end, int write,
+		struct page **pages);
+
 #endif /* __X86_64_UACCESS_H */
Index: linux-2.6/fs/bio.c
===================================================================
--- linux-2.6.orig/fs/bio.c
+++ linux-2.6/fs/bio.c
@@ -646,12 +646,8 @@ static struct bio *__bio_map_user_iov(st
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
-
+		ret = fast_gup(uaddr, local_nr_pages << PAGE_SHIFT, write_to_vm,
+				&pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -221,10 +221,8 @@ static struct page *blk_get_page(unsigne
 	if (pvec->idx == pvec->nr) {
 		nr_pages = PAGES_SPANNED(addr, count);
 		nr_pages = min(nr_pages, VEC_SIZE);
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, addr, nr_pages,
-				     rw == READ, 0, pvec->page, NULL);
-		up_read(&current->mm->mmap_sem);
+		ret = fast_gup(addr, nr_pages << PAGE_SHIFT, rw == READ,
+				pvec->page);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		pvec->nr = ret;
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c
+++ linux-2.6/fs/direct-io.c
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = fast_gup(
 		dio->curr_user_address,		/* Where from? */
-		nr_pages,			/* How many pages? */
+		nr_pages << PAGE_SHIFT,		/* Where to? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -1224,33 +1224,6 @@ static long do_splice(struct file *in, l
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1263,8 +1236,6 @@ static int get_iovec_page_array(const st
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1273,7 +1244,7 @@ static int get_iovec_page_array(const st
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1307,9 +1278,8 @@ static int get_iovec_page_array(const st
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = fast_gup((unsigned long) base, npages << PAGE_SHIFT, 0,
+				       &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1348,8 +1318,6 @@ static int get_iovec_page_array(const st
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-14  1:01                 ` [rfc] lockless get_user_pages for dio (and more) Nick Piggin
@ 2007-10-14 18:19                   ` Siddha, Suresh B
  2007-10-15  4:15                     ` Nick Piggin
  2007-10-15 12:25                     ` Nick Piggin
  0 siblings, 2 replies; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-14 18:19 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck

On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:
> On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> > On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> 
> > > sounds like two birds in one shot, I think.
> >
> > OK, I'll flesh it out a bit more and see if I can actually get
> > something working (and working with hugepages too).
> 
> This is just a really quick hack, untested ATM, but one that
> has at least a chance of working (on x86).

When we fall back to slow mode, we should decrement the ref counts
on the pages we got so far in the fast mode.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-14 18:19                   ` Siddha, Suresh B
@ 2007-10-15  4:15                     ` Nick Piggin
  2007-10-15 12:25                     ` Nick Piggin
  1 sibling, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-15  4:15 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Ken Chen, Badari Pulavarty, linux-mm, tony.luck

On Monday 15 October 2007 04:19, Siddha, Suresh B wrote:
> On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:
> > On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> > > On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> > > > sounds like two birds in one shot, I think.
> > >
> > > OK, I'll flesh it out a bit more and see if I can actually get
> > > something working (and working with hugepages too).
> >
> > This is just a really quick hack, untested ATM, but one that
> > has at least a chance of working (on x86).
>
> When we fall back to slow mode, we should decrement the ref counts
> on the pages we got so far in the fast mode.

Oops, you're right of course!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-14 18:19                   ` Siddha, Suresh B
  2007-10-15  4:15                     ` Nick Piggin
@ 2007-10-15 12:25                     ` Nick Piggin
  2007-10-15 17:03                       ` Badari Pulavarty
                                         ` (3 more replies)
  1 sibling, 4 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-15 12:25 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Ken Chen, Badari Pulavarty, linux-mm, tony.luck

[-- Attachment #1: Type: text/plain, Size: 1386 bytes --]

On Monday 15 October 2007 04:19, Siddha, Suresh B wrote:
> On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:
> > On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> > > On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> > > > sounds like two birds in one shot, I think.
> > >
> > > OK, I'll flesh it out a bit more and see if I can actually get
> > > something working (and working with hugepages too).
> >
> > This is just a really quick hack, untested ATM, but one that
> > has at least a chance of working (on x86).
>
> When we fall back to slow mode, we should decrement the ref counts
> on the pages we got so far in the fast mode.

Here is something that is actually tested and works (not
tested with hugepages yet, though).

However it's not 100% secure at the moment. It's actually
not completely trivial; I think we need to use an extra bit
in the present pte in order to exclude "not normal" pages,
if we want fast_gup to work on small page mappings too. I
think this would be possible to do on most architectures, but
I haven't done it here obviously.

Still, it should be enough to test the design. I've added
fast_gup and fast_gup_slow to /proc/vmstat, which count the
number of times fast_gup was called, and the number of times
it dropped into the slowpath. It would be interesting to know
how it performs compared to your granular hugepage ptl...

[-- Attachment #2: mm-get_user_pages-fast.patch --]
[-- Type: text/x-diff, Size: 10432 bytes --]

Introduce a new "fast_gup" (for want of a better name right now) which
is basically a get_user_pages with a less general API that is more suited
to the common case.

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This allows (at least on x86), an optimistic lockless pagetable walk,
without taking any page table locks or even mmap_sem. Page table existence
is guaranteed by turning interrupts off (combined with the fact that we're
always looking up the current mm, which would need an IPI before its
pagetables could be shot down from another CPU).

Many other architectures could do the same thing. Those that don't IPI
could potentially RCU free the page tables and do speculative references
on the pages (a la lockless pagecache) to achieve a lockless fast_gup.


---
Index: linux-2.6/arch/x86/lib/Makefile_64
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile_64
+++ linux-2.6/arch/x86/lib/Makefile_64
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup.o
Index: linux-2.6/arch/x86/lib/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/lib/gup.c
@@ -0,0 +1,168 @@
+/*
+ * Lockless fast_gup for x86
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+
+	/* XXX: this won't work for 32-bit (must map pte) */
+	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
+	do {
+		pte_t pte = *ptep;
+		unsigned long pfn;
+		struct page *page;
+
+		if ((pte_val(pte) & (_PAGE_PRESENT|_PAGE_USER)) != (_PAGE_PRESENT|_PAGE_USER))
+			return 0;
+
+		if (write && !pte_write(pte))
+			return 0;
+
+		/* XXX: really need new bit in pte to denote normal page */
+		pfn = pte_pfn(pte);
+		if (unlikely(!pfn_valid(pfn)))
+			return 0;
+
+		page = pfn_to_page(pfn);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *page;
+
+	if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
+		return 0;
+
+	BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	if (write && !pte_write(pte))
+		return 0;
+
+	page = pte_page(pte);
+	do {
+		unsigned long pfn_offset;
+		struct page *p;
+
+		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		p = page + pfn_offset;
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/* XXX: batch / limit 'nr', to avoid huge latency */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on x86-64.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86-64), we can follow the address down to the
+	 * the page.
+	 */
+	local_irq_disable();
+	__count_vm_event(FAST_GUP);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+slow:
+	{
+		int i, ret;
+
+		__count_vm_event(FAST_GUP_SLOW);
+		local_irq_enable();
+		for (i = 0; i < nr; i++)
+			put_page(pages[i]);
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		return ret;
+	}
+}
Index: linux-2.6/include/asm-x86/uaccess_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/uaccess_64.h
+++ linux-2.6/include/asm-x86/uaccess_64.h
@@ -381,4 +381,7 @@ static inline int __copy_from_user_inato
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+struct page;
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages);
+
 #endif /* __X86_64_UACCESS_H */
Index: linux-2.6/fs/bio.c
===================================================================
--- linux-2.6.orig/fs/bio.c
+++ linux-2.6/fs/bio.c
@@ -646,12 +646,7 @@ static struct bio *__bio_map_user_iov(st
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
-
+		ret = fast_gup(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -221,10 +221,7 @@ static struct page *blk_get_page(unsigne
 	if (pvec->idx == pvec->nr) {
 		nr_pages = PAGES_SPANNED(addr, count);
 		nr_pages = min(nr_pages, VEC_SIZE);
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, addr, nr_pages,
-				     rw == READ, 0, pvec->page, NULL);
-		up_read(&current->mm->mmap_sem);
+		ret = fast_gup(addr, nr_pages, rw == READ, pvec->page);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		pvec->nr = ret;
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c
+++ linux-2.6/fs/direct-io.c
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = fast_gup(
 		dio->curr_user_address,		/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -1224,33 +1224,6 @@ static long do_splice(struct file *in, l
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1263,8 +1236,6 @@ static int get_iovec_page_array(const st
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1273,7 +1244,7 @@ static int get_iovec_page_array(const st
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1307,9 +1278,7 @@ static int get_iovec_page_array(const st
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = fast_gup((unsigned long)base, npages, 0, &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1348,8 +1317,6 @@ static int get_iovec_page_array(const st
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 12:25                     ` Nick Piggin
@ 2007-10-15 17:03                       ` Badari Pulavarty
  2007-10-15 17:49                         ` Siddha, Suresh B
  2007-10-15 17:54                       ` Siddha, Suresh B
                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 30+ messages in thread
From: Badari Pulavarty @ 2007-10-15 17:03 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Siddha, Suresh B, Ken Chen, linux-mm, tony.luck

On Mon, 2007-10-15 at 22:25 +1000, Nick Piggin wrote:
..
> Here is something that is actually tested and works (not
> tested with hugepages yet, though).
> 
> However it's not 100% secure at the moment. It's actually
> not completely trivial; I think we need to use an extra bit
> in the present pte in order to exclude "not normal" pages,
> if we want fast_gup to work on small page mappings too. I
> think this would be possible to do on most architectures, but
> I haven't done it here obviously.
> 
> Still, it should be enough to test the design. I've added
> fast_gup and fast_gup_slow to /proc/vmstat, which count the
> number of times fast_gup was called, and the number of times
> it dropped into the slowpath. It would be interesting to know
> how it performs compared to your granular hugepage ptl...


+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long
end, int write, struct page **pages, int *nr)
+{
+       pte_t pte = *(pte_t *)&pmd;
+       struct page *page;
+
+       if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
+               return 0;
+
+       BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       if (write && !pte_write(pte))
+               return 0;
+
+       page = pte_page(pte);
+       do {
+               unsigned long pfn_offset;
+               struct page *p;
+
+               pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
+               p = page + pfn_offset;
+               get_page(page);
+               pages[*nr] = page;
+               (*nr)++;
+
+       } while (addr += PAGE_SIZE, addr != end);
                         ^^^^^^^^^^

Shouldn't this be HPAGE_SIZE ?

Thanks,
Badari       

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 17:03                       ` Badari Pulavarty
@ 2007-10-15 17:49                         ` Siddha, Suresh B
  0 siblings, 0 replies; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-15 17:49 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Nick Piggin, Siddha, Suresh B, Ken Chen, linux-mm, tony.luck

On Mon, Oct 15, 2007 at 10:03:52AM -0700, Badari Pulavarty wrote:
> On Mon, 2007-10-15 at 22:25 +1000, Nick Piggin wrote:
> +static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long
> end, int write, struct page **pages, int *nr)
> +{
> +       pte_t pte = *(pte_t *)&pmd;
> +       struct page *page;
> +
> +       if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
> +               return 0;
> +
> +       BUG_ON(!pfn_valid(pte_pfn(pte)));
> +
> +       if (write && !pte_write(pte))
> +               return 0;
> +
> +       page = pte_page(pte);
> +       do {
> +               unsigned long pfn_offset;
> +               struct page *p;
> +
> +               pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
> +               p = page + pfn_offset;
> +               get_page(page);
> +               pages[*nr] = page;
> +               (*nr)++;
> +
> +       } while (addr += PAGE_SIZE, addr != end);
>                          ^^^^^^^^^^
> 
> Shouldn't this be HPAGE_SIZE ?

I think it is compatible with old code. For a compound page, old code
is taking multiple ref counts and populating pages[] with all the individual
pages that make the compound page. Here, we are almost doing the same
thing (I say almost, because in here pages[] are getting populated with
the head page of the compound page. Anyhow routines like put_page that operate
on these pages should work seamlessly whether we use head/tail page).

thanks,
suresh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 12:25                     ` Nick Piggin
  2007-10-15 17:03                       ` Badari Pulavarty
@ 2007-10-15 17:54                       ` Siddha, Suresh B
  2007-10-15 20:21                       ` Ken Chen
  2007-12-10 21:30                       ` Dave Kleikamp
  3 siblings, 0 replies; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-15 17:54 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	twichell, shaggy

On Mon, Oct 15, 2007 at 10:25:11PM +1000, Nick Piggin wrote:
> On Monday 15 October 2007 04:19, Siddha, Suresh B wrote:
> > On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:
> > > On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> > > > On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> > > > > sounds like two birds in one shot, I think.
> > > >
> > > > OK, I'll flesh it out a bit more and see if I can actually get
> > > > something working (and working with hugepages too).
> > >
> > > This is just a really quick hack, untested ATM, but one that
> > > has at least a chance of working (on x86).
> >
> > When we fall back to slow mode, we should decrement the ref counts
> > on the pages we got so far in the fast mode.
> 
> Here is something that is actually tested and works (not
> tested with hugepages yet, though).
> 
> However it's not 100% secure at the moment. It's actually
> not completely trivial; I think we need to use an extra bit
> in the present pte in order to exclude "not normal" pages,
> if we want fast_gup to work on small page mappings too. I
> think this would be possible to do on most architectures, but
> I haven't done it here obviously.
> 
> Still, it should be enough to test the design. I've added
> fast_gup and fast_gup_slow to /proc/vmstat, which count the
> number of times fast_gup was called, and the number of times
> it dropped into the slowpath. It would be interesting to know
> how it performs compared to your granular hugepage ptl...

I am reasonably sure, it will perform better than mine, as it addresses
the mmap_sem cacheline bouncing also.

I think Brian/Badari can help us out in getting the numbers.

thanks,
suresh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 12:25                     ` Nick Piggin
  2007-10-15 17:03                       ` Badari Pulavarty
  2007-10-15 17:54                       ` Siddha, Suresh B
@ 2007-10-15 20:21                       ` Ken Chen
  2007-10-16  2:15                         ` Nick Piggin
  2007-12-10 21:30                       ` Dave Kleikamp
  3 siblings, 1 reply; 30+ messages in thread
From: Ken Chen @ 2007-10-15 20:21 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

On 10/15/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> +static int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> +{
> +	pte_t pte = *(pte_t *)&pmd;
> +
> +	if (write && !pte_write(pte))
> +		return 0;
> +
> +	do {
> +		unsigned long pfn_offset;
> +		struct page *page;
> +
> +		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
> +		page = pte_page(pte) + pfn_offset;
> +		get_page(page);
> +		pages[*nr] = page;
> +		(*nr)++;
> +
> +	} while (addr += PAGE_SIZE, addr != end);
> +
> +	return 1;
> +}

Since get_page() on compound page will reference back to the head
page, you can take a ref directly against the head page instead of
traversing to tail page and loops around back to the head page.  It is
especially beneficial for large hugetlb page size, i.e., 1 GB page
size so one does not have to pollute cache with tail page's struct
page. I prefer doing the following:

+		page = pte_page(pte);
+		get_page(page);
+		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		pages[*nr] = page + pfn_offset;
+		(*nr)++;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 20:21                       ` Ken Chen
@ 2007-10-16  2:15                         ` Nick Piggin
  2007-10-16  0:14                           ` Dave Hansen
  2007-10-16  3:32                           ` Nick Piggin
  0 siblings, 2 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-16  2:15 UTC (permalink / raw)
  To: Ken Chen; +Cc: Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

On Tuesday 16 October 2007 06:21, Ken Chen wrote:
> On 10/15/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > +static int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> > +{
> > +	pte_t pte = *(pte_t *)&pmd;
> > +
> > +	if (write && !pte_write(pte))
> > +		return 0;
> > +
> > +	do {
> > +		unsigned long pfn_offset;
> > +		struct page *page;
> > +
> > +		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
> > +		page = pte_page(pte) + pfn_offset;
> > +		get_page(page);
> > +		pages[*nr] = page;
> > +		(*nr)++;
> > +
> > +	} while (addr += PAGE_SIZE, addr != end);
> > +
> > +	return 1;
> > +}
>
> Since get_page() on compound page will reference back to the head
> page, you can take a ref directly against the head page instead of
> traversing to tail page and loops around back to the head page.  It is
> especially beneficial for large hugetlb page size, i.e., 1 GB page
> size so one does not have to pollute cache with tail page's struct
> page. I prefer doing the following:
>
> +		page = pte_page(pte);
> +		get_page(page);
> +		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
> +		pages[*nr] = page + pfn_offset;
> +		(*nr)++;

Very good point. Actually we could also possibly optimise this
loop so that all it does is to fill the pages[] array, and then
have a function to increment the head page refcount by "N", thus
reducing atomic operations by a factor of N...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-16  2:15                         ` Nick Piggin
@ 2007-10-16  0:14                           ` Dave Hansen
  2007-10-16  3:26                             ` Nick Piggin
  2007-10-16  3:32                           ` Nick Piggin
  1 sibling, 1 reply; 30+ messages in thread
From: Dave Hansen @ 2007-10-16  0:14 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ken Chen, Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

> +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
> +{
> +       pte_t *ptep;
> +
> +       /* XXX: this won't work for 32-bit (must map pte) */
> +       ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
> +       do {
> +               pte_t pte = *ptep;
> +               unsigned long pfn;
> +               struct page *page;
> +
> +               if ((pte_val(pte) & (_PAGE_PRESENT|_PAGE_USER)) != (_PAGE_PRESENT|_PAGE_USER))
> +                       return 0;
> +
> +               if (write && !pte_write(pte))
> +                       return 0;
> +
> +               /* XXX: really need new bit in pte to denote normal page */
> +               pfn = pte_pfn(pte);
> +               if (unlikely(!pfn_valid(pfn)))
> +                       return 0;

Is that little pfn_valid() nugget to help detect VM_IO and VM_PFNMAP
areas?  Does that work 100% of the time?  Is it for anything else?

If that is all that you want a bit in the pte for, I guess we could get
a bitfield or a simple flag in the mm to say whether there are any
VM_IO/PFNMAP areas around.  If we used the same IPI/RCU rules as
pagetables to manage such a flag, I think it would be sufficient to dump
us into the slow path when we hit those areas.  

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-16  0:14                           ` Dave Hansen
@ 2007-10-16  3:26                             ` Nick Piggin
  0 siblings, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-16  3:26 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ken Chen, Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

On Tuesday 16 October 2007 10:14, Dave Hansen wrote:
> > +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long
> > end, int write, struct page **pages, int *nr) +{
> > +       pte_t *ptep;
> > +
> > +       /* XXX: this won't work for 32-bit (must map pte) */
> > +       ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
> > +       do {
> > +               pte_t pte = *ptep;
> > +               unsigned long pfn;
> > +               struct page *page;
> > +
> > +               if ((pte_val(pte) & (_PAGE_PRESENT|_PAGE_USER)) !=
> > (_PAGE_PRESENT|_PAGE_USER)) +                       return 0;
> > +
> > +               if (write && !pte_write(pte))
> > +                       return 0;
> > +
> > +               /* XXX: really need new bit in pte to denote normal page
> > */ +               pfn = pte_pfn(pte);
> > +               if (unlikely(!pfn_valid(pfn)))
> > +                       return 0;
>
> Is that little pfn_valid() nugget to help detect VM_IO and VM_PFNMAP
> areas?

Yes.

> Does that work 100% of the time?

No, because we can mmap /dev/mem for example and point to valid
pfns, but it would be a bug to take a ref on them.


> Is it for anything else?  
>
> If that is all that you want a bit in the pte for, I guess we could get
> a bitfield or a simple flag in the mm to say whether there are any
> VM_IO/PFNMAP areas around.  If we used the same IPI/RCU rules as
> pagetables to manage such a flag, I think it would be sufficient to dump
> us into the slow path when we hit those areas.

I don't see any problem with using a new bit in the pte. We kind of
wanted to do this to simplify some of the COW rules in the core VM
anyway. I think s390 doesn't have any spare bits, so I suppose that
guy could implement said flag if they want a fast_gup as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-16  2:15                         ` Nick Piggin
  2007-10-16  0:14                           ` Dave Hansen
@ 2007-10-16  3:32                           ` Nick Piggin
  1 sibling, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-16  3:32 UTC (permalink / raw)
  To: Ken Chen; +Cc: Siddha, Suresh B, Badari Pulavarty, linux-mm, tony.luck

On Tuesday 16 October 2007 12:15, Nick Piggin wrote:
> On Tuesday 16 October 2007 06:21, Ken Chen wrote:

> > Since get_page() on compound page will reference back to the head
> > page, you can take a ref directly against the head page instead of
> > traversing to tail page and loops around back to the head page.  It is
> > especially beneficial for large hugetlb page size, i.e., 1 GB page
> > size so one does not have to pollute cache with tail page's struct
> > page. I prefer doing the following:
> >
> > +		page = pte_page(pte);
> > +		get_page(page);
> > +		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
> > +		pages[*nr] = page + pfn_offset;
> > +		(*nr)++;
>
> Very good point. Actually we could also possibly optimise this
> loop so that all it does is to fill the pages[] array, and then
> have a function to increment the head page refcount by "N", thus
> reducing atomic operations by a factor of N...

This is what I've ended up with... it should be extremely fast
to get a large number of pages out of a hugepage.

static inline void get_head_page_multiple(struct page *page, int nr)
{
        VM_BUG_ON(page != compound_head(page));
        VM_BUG_ON(page_count(page) == 0);
        atomic_add(nr, &page->_count);
}

static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
        pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;

        if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
                return 0;

        BUG_ON(!pfn_valid(pte_pfn(pte)));

        if (write && !pte_write(pte))
                return 0;

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
        do {
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-10-15 12:25                     ` Nick Piggin
                                         ` (2 preceding siblings ...)
  2007-10-15 20:21                       ` Ken Chen
@ 2007-12-10 21:30                       ` Dave Kleikamp
  2007-12-12  4:57                         ` Nick Piggin
  3 siblings, 1 reply; 30+ messages in thread
From: Dave Kleikamp @ 2007-12-10 21:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

On Mon, 2007-10-15 at 22:25 +1000, Nick Piggin wrote:
> On Monday 15 October 2007 04:19, Siddha, Suresh B wrote:
> > On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:

> > > This is just a really quick hack, untested ATM, but one that
> > > has at least a chance of working (on x86).
> >
> > When we fall back to slow mode, we should decrement the ref counts
> > on the pages we got so far in the fast mode.
> 
> Here is something that is actually tested and works (not
> tested with hugepages yet, though).
> 
> However it's not 100% secure at the moment. It's actually
> not completely trivial; I think we need to use an extra bit
> in the present pte in order to exclude "not normal" pages,
> if we want fast_gup to work on small page mappings too. I
> think this would be possible to do on most architectures, but
> I haven't done it here obviously.
> 
> Still, it should be enough to test the design. I've added
> fast_gup and fast_gup_slow to /proc/vmstat, which count the
> number of times fast_gup was called, and the number of times
> it dropped into the slowpath. It would be interesting to know
> how it performs compared to your granular hugepage ptl...

Nick,
I've played with the fast_gup patch a bit.  I was able to find a problem
in follow_hugetlb_page() that Adam Litke fixed.  I'm haven't been brave
enough to implement it on any other architectures, but I did add  a
default that takes mmap_sem and calls the normal get_user_pages() if the
architecture doesn't define fast_gup().  I put it in linux/mm.h, for
lack of a better place, but it's a little kludgy since I didn't want
mm.h to have to include sched.h.  This patch is against 2.6.24-rc4.
It's not ready for inclusion yet, of course.

I haven't done much benchmarking.  The one test I was looking at didn't
show much of a change.

 ==============================================
Introduce a new "fast_gup" (for want of a better name right now) which
is basically a get_user_pages with a less general API that is more suited
to the common case.

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This allows (at least on x86), an optimistic lockless pagetable walk,
without taking any page table locks or even mmap_sem. Page table existence
is guaranteed by turning interrupts off (combined with the fact that we're
always looking up the current mm, which would need an IPI before its
pagetables could be shot down from another CPU).

Many other architectures could do the same thing. Those that don't IPI
could potentially RCU free the page tables and do speculative references
on the pages (a la lockless pagecache) to achieve a lockless fast_gup.

Originally by Nick Piggin <nickpiggin@yahoo.com.au>
---
 arch/x86/lib/Makefile_64     |    2 
 arch/x86/lib/gup_64.c        |  188 +++++++++++++++++++++++++++++++++++++++++++
 fs/bio.c                     |    8 -
 fs/block_dev.c               |    5 -
 fs/direct-io.c               |   10 --
 fs/splice.c                  |   38 --------
 include/asm-x86/uaccess_64.h |    4 
 include/linux/mm.h           |   26 +++++
 include/linux/vmstat.h       |    1 
 mm/vmstat.c                  |    3 
 10 files changed, 231 insertions(+), 54 deletions(-)

diff -Nurp linux-2.6.24-rc4/arch/x86/lib/Makefile_64 linux/arch/x86/lib/Makefile_64
--- linux-2.6.24-rc4/arch/x86/lib/Makefile_64	2007-12-04 08:44:34.000000000 -0600
+++ linux/arch/x86/lib/Makefile_64	2007-12-10 15:01:17.000000000 -0600
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup_64.o
diff -Nurp linux-2.6.24-rc4/arch/x86/lib/gup_64.c linux/arch/x86/lib/gup_64.c
--- linux-2.6.24-rc4/arch/x86/lib/gup_64.c	1969-12-31 18:00:00.000000000 -0600
+++ linux/arch/x86/lib/gup_64.c	2007-12-10 15:01:17.000000000 -0600
@@ -0,0 +1,188 @@
+/*
+ * Lockless fast_gup for x86
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+
+	/* XXX: this won't work for 32-bit (must map pte) */
+	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
+	do {
+		pte_t pte = *ptep;
+		unsigned long pfn;
+		struct page *page;
+
+		if ((pte_val(pte) & (_PAGE_PRESENT|_PAGE_USER)) !=
+		    (_PAGE_PRESENT|_PAGE_USER))
+			return 0;
+
+		if (write && !pte_write(pte))
+			return 0;
+
+		/* XXX: really need new bit in pte to denote normal page */
+		pfn = pte_pfn(pte);
+		if (unlikely(!pfn_valid(pfn)))
+			return 0;
+
+		page = pfn_to_page(pfn);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
+		return 0;
+
+	BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	if (write && !pte_write(pte))
+		return 0;
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+	do {
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	static int count = 50;
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) {
+				if (count) {
+					printk(KERN_ERR
+						"pmd = 0x%lx, addr = 0x%lx\n",
+						pmd.pmd, addr);
+					count--;
+				}
+				return 0;
+			}
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/* XXX: batch / limit 'nr', to avoid huge latency */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on x86-64.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86-64), we can follow the address down to the
+	 * the page.
+	 */
+	local_irq_disable();
+	__count_vm_event(FAST_GUP);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+slow:
+	{
+		int i, ret;
+
+		__count_vm_event(FAST_GUP_SLOW);
+		local_irq_enable();
+		for (i = 0; i < nr; i++)
+			put_page(pages[i]);
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		return ret;
+	}
+}
diff -Nurp linux-2.6.24-rc4/fs/bio.c linux/fs/bio.c
--- linux-2.6.24-rc4/fs/bio.c	2007-12-04 08:44:49.000000000 -0600
+++ linux/fs/bio.c	2007-12-10 15:01:17.000000000 -0600
@@ -636,13 +636,9 @@ static struct bio *__bio_map_user_iov(st
 		unsigned long start = uaddr >> PAGE_SHIFT;
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
-		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
 
+		ret = fast_gup(uaddr, local_nr_pages, write_to_vm,
+			       &pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
diff -Nurp linux-2.6.24-rc4/fs/block_dev.c linux/fs/block_dev.c
--- linux-2.6.24-rc4/fs/block_dev.c	2007-12-04 08:44:49.000000000 -0600
+++ linux/fs/block_dev.c	2007-12-10 15:01:17.000000000 -0600
@@ -221,10 +221,7 @@ static struct page *blk_get_page(unsigne
 	if (pvec->idx == pvec->nr) {
 		nr_pages = PAGES_SPANNED(addr, count);
 		nr_pages = min(nr_pages, VEC_SIZE);
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, addr, nr_pages,
-				     rw == READ, 0, pvec->page, NULL);
-		up_read(&current->mm->mmap_sem);
+		ret = fast_gup(addr, nr_pages, rw == READ, pvec->page);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		pvec->nr = ret;
diff -Nurp linux-2.6.24-rc4/fs/direct-io.c linux/fs/direct-io.c
--- linux-2.6.24-rc4/fs/direct-io.c	2007-12-04 08:44:49.000000000 -0600
+++ linux/fs/direct-io.c	2007-12-10 15:01:17.000000000 -0600
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = fast_gup(
 		dio->curr_user_address,		/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
diff -Nurp linux-2.6.24-rc4/fs/splice.c linux/fs/splice.c
--- linux-2.6.24-rc4/fs/splice.c	2007-12-04 08:44:50.000000000 -0600
+++ linux/fs/splice.c	2007-12-10 15:01:17.000000000 -0600
@@ -1174,33 +1174,6 @@ static long do_splice(struct file *in, l
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1213,8 +1186,6 @@ static int get_iovec_page_array(const st
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1223,7 +1194,7 @@ static int get_iovec_page_array(const st
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1257,9 +1228,8 @@ static int get_iovec_page_array(const st
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = fast_gup((unsigned long)base, npages, 0,
+				 &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1298,8 +1268,6 @@ static int get_iovec_page_array(const st
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;
 
diff -Nurp linux-2.6.24-rc4/include/asm-x86/uaccess_64.h linux/include/asm-x86/uaccess_64.h
--- linux-2.6.24-rc4/include/asm-x86/uaccess_64.h	2007-12-04 08:44:54.000000000 -0600
+++ linux/include/asm-x86/uaccess_64.h	2007-12-10 15:01:17.000000000 -0600
@@ -381,4 +381,8 @@ static inline int __copy_from_user_inato
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+#define __HAVE_ARCH_FAST_GUP
+struct page;
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages);
+
 #endif /* __X86_64_UACCESS_H */
diff -Nurp linux-2.6.24-rc4/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.6.24-rc4/include/linux/mm.h	2007-12-04 08:44:56.000000000 -0600
+++ linux/include/linux/mm.h	2007-12-10 15:01:17.000000000 -0600
@@ -12,6 +12,7 @@
 #include <linux/prio_tree.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
+#include <linux/uaccess.h>	/* for __HAVE_ARCH_FAST_GUP */
 
 struct mempolicy;
 struct anon_vma;
@@ -750,6 +751,31 @@ extern int mprotect_fixup(struct vm_area
 			  unsigned long end, unsigned long newflags);
 
 /*
+ * Architecture may implement efficient get_user_pages to avoid having to
+ * take the mmap sem
+ */
+#ifndef __HAVE_ARCH_FAST_GUP
+static inline int __fast_gup(struct mm_struct *mm, unsigned long start,
+			     int nr_pages, int write, struct page **pages)
+{
+	int ret;
+
+	down_read(&mm->mmap_sem);
+	ret = get_user_pages(current, mm, start, nr_pages, write,
+			     0, pages, NULL);
+	up_read(&mm->mmap_sem);
+
+	return ret;
+}
+/*
+ * This macro avoids having to include sched.h in this header to
+ * dereference current->mm.
+ */
+#define fast_gup(start, nr_pages, write, pages) \
+	__fast_gup(current->mm, start, nr_pages, write, pages)
+#endif
+
+/*
  * A callback you can register to apply pressure to ageable caches.
  *
  * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
diff -Nurp linux-2.6.24-rc4/include/linux/vmstat.h linux/include/linux/vmstat.h
--- linux-2.6.24-rc4/include/linux/vmstat.h	2007-10-09 15:31:38.000000000 -0500
+++ linux/include/linux/vmstat.h	2007-12-10 15:01:17.000000000 -0600
@@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
 		FOR_ALL_ZONES(PGSCAN_DIRECT),
 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+		FAST_GUP, FAST_GUP_SLOW,
 		NR_VM_EVENT_ITEMS
 };
 
diff -Nurp linux-2.6.24-rc4/mm/vmstat.c linux/mm/vmstat.c
--- linux-2.6.24-rc4/mm/vmstat.c	2007-12-04 08:45:01.000000000 -0600
+++ linux/mm/vmstat.c	2007-12-10 15:01:17.000000000 -0600
@@ -642,6 +642,9 @@ static const char * const vmstat_text[] 
 	"allocstall",
 
 	"pgrotated",
+	"fast_gup",
+	"fast_gup_slow",
+
 #endif
 };
 

-- 
David Kleikamp
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-12-10 21:30                       ` Dave Kleikamp
@ 2007-12-12  4:57                         ` Nick Piggin
  2007-12-12  5:11                           ` Dave Kleikamp
  0 siblings, 1 reply; 30+ messages in thread
From: Nick Piggin @ 2007-12-12  4:57 UTC (permalink / raw)
  To: Dave Kleikamp
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

On Tuesday 11 December 2007 08:30, Dave Kleikamp wrote:
> On Mon, 2007-10-15 at 22:25 +1000, Nick Piggin wrote:
> > On Monday 15 October 2007 04:19, Siddha, Suresh B wrote:
> > > On Sun, Oct 14, 2007 at 11:01:02AM +1000, Nick Piggin wrote:
> > > > This is just a really quick hack, untested ATM, but one that
> > > > has at least a chance of working (on x86).
> > >
> > > When we fall back to slow mode, we should decrement the ref counts
> > > on the pages we got so far in the fast mode.
> >
> > Here is something that is actually tested and works (not
> > tested with hugepages yet, though).
> >
> > However it's not 100% secure at the moment. It's actually
> > not completely trivial; I think we need to use an extra bit
> > in the present pte in order to exclude "not normal" pages,
> > if we want fast_gup to work on small page mappings too. I
> > think this would be possible to do on most architectures, but
> > I haven't done it here obviously.
> >
> > Still, it should be enough to test the design. I've added
> > fast_gup and fast_gup_slow to /proc/vmstat, which count the
> > number of times fast_gup was called, and the number of times
> > it dropped into the slowpath. It would be interesting to know
> > how it performs compared to your granular hugepage ptl...
>
> Nick,
> I've played with the fast_gup patch a bit.  I was able to find a problem
> in follow_hugetlb_page() that Adam Litke fixed.  I'm haven't been brave
> enough to implement it on any other architectures, but I did add  a
> default that takes mmap_sem and calls the normal get_user_pages() if the
> architecture doesn't define fast_gup().  I put it in linux/mm.h, for
> lack of a better place, but it's a little kludgy since I didn't want
> mm.h to have to include sched.h.  This patch is against 2.6.24-rc4.
> It's not ready for inclusion yet, of course.

Hi Dave,

Thanks so much. This makes it much more a complete patch (although
still missing the "normal page" detection).

I think I missed -- or forgot -- what was the follow_hugetlb_page
problem?

Anyway, I am hoping that someone will one day and test if this and
find it helps their workload, but on the other hand, if it doesn't
help anyone then we don't have to worry about adding it to the
kernel ;) I don't have any real setups that hammers DIO with threads.
I'm guessing DB2 and/or Oracle does?

Thanks,
Nick

>
> I haven't done much benchmarking.  The one test I was looking at didn't
> show much of a change.
>
>  ==============================================
> Introduce a new "fast_gup" (for want of a better name right now) which
> is basically a get_user_pages with a less general API that is more suited
> to the common case.
>
> - task and mm are always current and current->mm
> - force is always 0
> - pages is always non-NULL
> - don't pass back vmas
>
> This allows (at least on x86), an optimistic lockless pagetable walk,
> without taking any page table locks or even mmap_sem. Page table existence
> is guaranteed by turning interrupts off (combined with the fact that we're
> always looking up the current mm, which would need an IPI before its
> pagetables could be shot down from another CPU).
>
> Many other architectures could do the same thing. Those that don't IPI
> could potentially RCU free the page tables and do speculative references
> on the pages (a la lockless pagecache) to achieve a lockless fast_gup.
>
> Originally by Nick Piggin <nickpiggin@yahoo.com.au>
> ---
>  arch/x86/lib/Makefile_64     |    2
>  arch/x86/lib/gup_64.c        |  188
> +++++++++++++++++++++++++++++++++++++++++++ fs/bio.c                     | 
>   8 -
>  fs/block_dev.c               |    5 -
>  fs/direct-io.c               |   10 --
>  fs/splice.c                  |   38 --------
>  include/asm-x86/uaccess_64.h |    4
>  include/linux/mm.h           |   26 +++++
>  include/linux/vmstat.h       |    1
>  mm/vmstat.c                  |    3
>  10 files changed, 231 insertions(+), 54 deletions(-)
>
> diff -Nurp linux-2.6.24-rc4/arch/x86/lib/Makefile_64
> linux/arch/x86/lib/Makefile_64 ---
> linux-2.6.24-rc4/arch/x86/lib/Makefile_64	2007-12-04 08:44:34.000000000
> -0600 +++ linux/arch/x86/lib/Makefile_64	2007-12-10 15:01:17.000000000
> -0600 @@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
>  lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
>  	usercopy_64.o getuser_64.o putuser_64.o  \
>  	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
> -lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o
> copy_user_nocache_64.o +lib-y += memcpy_64.o memmove_64.o memset_64.o
> copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup_64.o diff -Nurp
> linux-2.6.24-rc4/arch/x86/lib/gup_64.c linux/arch/x86/lib/gup_64.c ---
> linux-2.6.24-rc4/arch/x86/lib/gup_64.c	1969-12-31 18:00:00.000000000 -0600
> +++ linux/arch/x86/lib/gup_64.c	2007-12-10 15:01:17.000000000 -0600 @@ -0,0
> +1,188 @@
> +/*
> + * Lockless fast_gup for x86
> + *
> + * Copyright (C) 2007 Nick Piggin
> + * Copyright (C) 2007 Novell Inc.
> + */
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/vmstat.h>
> +#include <asm/pgtable.h>
> +
> +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
> +			 int write, struct page **pages, int *nr)
> +{
> +	pte_t *ptep;
> +
> +	/* XXX: this won't work for 32-bit (must map pte) */
> +	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
> +	do {
> +		pte_t pte = *ptep;
> +		unsigned long pfn;
> +		struct page *page;
> +
> +		if ((pte_val(pte) & (_PAGE_PRESENT|_PAGE_USER)) !=
> +		    (_PAGE_PRESENT|_PAGE_USER))
> +			return 0;
> +
> +		if (write && !pte_write(pte))
> +			return 0;
> +
> +		/* XXX: really need new bit in pte to denote normal page */
> +		pfn = pte_pfn(pte);
> +		if (unlikely(!pfn_valid(pfn)))
> +			return 0;
> +
> +		page = pfn_to_page(pfn);
> +		get_page(page);
> +		pages[*nr] = page;
> +		(*nr)++;
> +
> +	} while (ptep++, addr += PAGE_SIZE, addr != end);
> +	pte_unmap(ptep - 1);
> +
> +	return 1;
> +}
> +
> +static inline void get_head_page_multiple(struct page *page, int nr)
> +{
> +	VM_BUG_ON(page != compound_head(page));
> +	VM_BUG_ON(page_count(page) == 0);
> +	atomic_add(nr, &page->_count);
> +}
> +
> +static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
> +			int write, struct page **pages, int *nr)
> +{
> +	pte_t pte = *(pte_t *)&pmd;
> +	struct page *head, *page;
> +	int refs;
> +
> +	if ((pte_val(pte) & _PAGE_USER) != _PAGE_USER)
> +		return 0;
> +
> +	BUG_ON(!pfn_valid(pte_pfn(pte)));
> +
> +	if (write && !pte_write(pte))
> +		return 0;
> +
> +	refs = 0;
> +	head = pte_page(pte);
> +	page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
> +	do {
> +		pages[*nr] = page;
> +		(*nr)++;
> +		page++;
> +		refs++;
> +	} while (addr += PAGE_SIZE, addr != end);
> +
> +	get_head_page_multiple(head, refs);
> +
> +	return 1;
> +}
> +
> +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> +			 int write, struct page **pages, int *nr)
> +{
> +	static int count = 50;
> +	unsigned long next;
> +	pmd_t *pmdp;
> +
> +	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
> +	do {
> +		pmd_t pmd = *pmdp;
> +
> +		next = pmd_addr_end(addr, end);
> +		if (pmd_none(pmd))
> +			return 0;
> +		if (unlikely(pmd_large(pmd))) {
> +			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) {
> +				if (count) {
> +					printk(KERN_ERR
> +						"pmd = 0x%lx, addr = 0x%lx\n",
> +						pmd.pmd, addr);
> +					count--;
> +				}
> +				return 0;
> +			}
> +		} else {
> +			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> +				return 0;
> +		}
> +	} while (pmdp++, addr = next, addr != end);
> +
> +	return 1;
> +}
> +
> +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> +			 int write, struct page **pages, int *nr)
> +{
> +	unsigned long next;
> +	pud_t *pudp;
> +
> +	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
> +	do {
> +		pud_t pud = *pudp;
> +
> +		next = pud_addr_end(addr, end);
> +		if (pud_none(pud))
> +			return 0;
> +		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> +			return 0;
> +	} while (pudp++, addr = next, addr != end);
> +
> +	return 1;
> +}
> +
> +int fast_gup(unsigned long start, int nr_pages, int write, struct page
> **pages) +{
> +	struct mm_struct *mm = current->mm;
> +	unsigned long end = start + (nr_pages << PAGE_SHIFT);
> +	unsigned long addr = start;
> +	unsigned long next;
> +	pgd_t *pgdp;
> +	int nr = 0;
> +
> +	/* XXX: batch / limit 'nr', to avoid huge latency */
> +	/*
> +	 * This doesn't prevent pagetable teardown, but does prevent
> +	 * the pagetables from being freed on x86-64.
> +	 *
> +	 * So long as we atomically load page table pointers versus teardown
> +	 * (which we do on x86-64), we can follow the address down to the
> +	 * the page.
> +	 */
> +	local_irq_disable();
> +	__count_vm_event(FAST_GUP);
> +	pgdp = pgd_offset(mm, addr);
> +	do {
> +		pgd_t pgd = *pgdp;
> +
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none(pgd))
> +			goto slow;
> +		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> +			goto slow;
> +	} while (pgdp++, addr = next, addr != end);
> +	local_irq_enable();
> +
> +	BUG_ON(nr != (end - start) >> PAGE_SHIFT);
> +	return nr;
> +
> +slow:
> +	{
> +		int i, ret;
> +
> +		__count_vm_event(FAST_GUP_SLOW);
> +		local_irq_enable();
> +		for (i = 0; i < nr; i++)
> +			put_page(pages[i]);
> +
> +		down_read(&mm->mmap_sem);
> +		ret = get_user_pages(current, mm, start,
> +			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
> +		up_read(&mm->mmap_sem);
> +
> +		return ret;
> +	}
> +}
> diff -Nurp linux-2.6.24-rc4/fs/bio.c linux/fs/bio.c
> --- linux-2.6.24-rc4/fs/bio.c	2007-12-04 08:44:49.000000000 -0600
> +++ linux/fs/bio.c	2007-12-10 15:01:17.000000000 -0600
> @@ -636,13 +636,9 @@ static struct bio *__bio_map_user_iov(st
>  		unsigned long start = uaddr >> PAGE_SHIFT;
>  		const int local_nr_pages = end - start;
>  		const int page_limit = cur_page + local_nr_pages;
> -
> -		down_read(&current->mm->mmap_sem);
> -		ret = get_user_pages(current, current->mm, uaddr,
> -				     local_nr_pages,
> -				     write_to_vm, 0, &pages[cur_page], NULL);
> -		up_read(&current->mm->mmap_sem);
>
> +		ret = fast_gup(uaddr, local_nr_pages, write_to_vm,
> +			       &pages[cur_page]);
>  		if (ret < local_nr_pages) {
>  			ret = -EFAULT;
>  			goto out_unmap;
> diff -Nurp linux-2.6.24-rc4/fs/block_dev.c linux/fs/block_dev.c
> --- linux-2.6.24-rc4/fs/block_dev.c	2007-12-04 08:44:49.000000000 -0600
> +++ linux/fs/block_dev.c	2007-12-10 15:01:17.000000000 -0600
> @@ -221,10 +221,7 @@ static struct page *blk_get_page(unsigne
>  	if (pvec->idx == pvec->nr) {
>  		nr_pages = PAGES_SPANNED(addr, count);
>  		nr_pages = min(nr_pages, VEC_SIZE);
> -		down_read(&current->mm->mmap_sem);
> -		ret = get_user_pages(current, current->mm, addr, nr_pages,
> -				     rw == READ, 0, pvec->page, NULL);
> -		up_read(&current->mm->mmap_sem);
> +		ret = fast_gup(addr, nr_pages, rw == READ, pvec->page);
>  		if (ret < 0)
>  			return ERR_PTR(ret);
>  		pvec->nr = ret;
> diff -Nurp linux-2.6.24-rc4/fs/direct-io.c linux/fs/direct-io.c
> --- linux-2.6.24-rc4/fs/direct-io.c	2007-12-04 08:44:49.000000000 -0600
> +++ linux/fs/direct-io.c	2007-12-10 15:01:17.000000000 -0600
> @@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
>  	int nr_pages;
>
>  	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
> -	down_read(&current->mm->mmap_sem);
> -	ret = get_user_pages(
> -		current,			/* Task for fault acounting */
> -		current->mm,			/* whose pages? */
> +	ret = fast_gup(
>  		dio->curr_user_address,		/* Where from? */
>  		nr_pages,			/* How many pages? */
>  		dio->rw == READ,		/* Write to memory? */
> -		0,				/* force (?) */
> -		&dio->pages[0],
> -		NULL);				/* vmas */
> -	up_read(&current->mm->mmap_sem);
> +		&dio->pages[0]);		/* Put results here */
>
>  	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
>  		struct page *page = ZERO_PAGE(0);
> diff -Nurp linux-2.6.24-rc4/fs/splice.c linux/fs/splice.c
> --- linux-2.6.24-rc4/fs/splice.c	2007-12-04 08:44:50.000000000 -0600
> +++ linux/fs/splice.c	2007-12-10 15:01:17.000000000 -0600
> @@ -1174,33 +1174,6 @@ static long do_splice(struct file *in, l
>  }
>
>  /*
> - * Do a copy-from-user while holding the mmap_semaphore for reading, in a
> - * manner safe from deadlocking with simultaneous mmap() (grabbing
> mmap_sem - * for writing) and page faulting on the user memory pointed to
> by src. - * This assumes that we will very rarely hit the partial != 0
> path, or this - * will not be a win.
> - */
> -static int copy_from_user_mmap_sem(void *dst, const void __user *src,
> size_t n) -{
> -	int partial;
> -
> -	pagefault_disable();
> -	partial = __copy_from_user_inatomic(dst, src, n);
> -	pagefault_enable();
> -
> -	/*
> -	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
> -	 */
> -	if (unlikely(partial)) {
> -		up_read(&current->mm->mmap_sem);
> -		partial = copy_from_user(dst, src, n);
> -		down_read(&current->mm->mmap_sem);
> -	}
> -
> -	return partial;
> -}
> -
> -/*
>   * Map an iov into an array of pages and offset/length tupples. With the
>   * partial_page structure, we can map several non-contiguous ranges into
>   * our ones pages[] map instead of splitting that operation into pieces.
> @@ -1213,8 +1186,6 @@ static int get_iovec_page_array(const st
>  {
>  	int buffers = 0, error = 0;
>
> -	down_read(&current->mm->mmap_sem);
> -
>  	while (nr_vecs) {
>  		unsigned long off, npages;
>  		struct iovec entry;
> @@ -1223,7 +1194,7 @@ static int get_iovec_page_array(const st
>  		int i;
>
>  		error = -EFAULT;
> -		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
> +		if (copy_from_user(&entry, iov, sizeof(entry)))
>  			break;
>
>  		base = entry.iov_base;
> @@ -1257,9 +1228,8 @@ static int get_iovec_page_array(const st
>  		if (npages > PIPE_BUFFERS - buffers)
>  			npages = PIPE_BUFFERS - buffers;
>
> -		error = get_user_pages(current, current->mm,
> -				       (unsigned long) base, npages, 0, 0,
> -				       &pages[buffers], NULL);
> +		error = fast_gup((unsigned long)base, npages, 0,
> +				 &pages[buffers]);
>
>  		if (unlikely(error <= 0))
>  			break;
> @@ -1298,8 +1268,6 @@ static int get_iovec_page_array(const st
>  		iov++;
>  	}
>
> -	up_read(&current->mm->mmap_sem);
> -
>  	if (buffers)
>  		return buffers;
>
> diff -Nurp linux-2.6.24-rc4/include/asm-x86/uaccess_64.h
> linux/include/asm-x86/uaccess_64.h ---
> linux-2.6.24-rc4/include/asm-x86/uaccess_64.h	2007-12-04 08:44:54.000000000
> -0600 +++ linux/include/asm-x86/uaccess_64.h	2007-12-10 15:01:17.000000000
> -0600 @@ -381,4 +381,8 @@ static inline int __copy_from_user_inato
>  	return __copy_user_nocache(dst, src, size, 0);
>  }
>
> +#define __HAVE_ARCH_FAST_GUP
> +struct page;
> +int fast_gup(unsigned long start, int nr_pages, int write, struct page
> **pages); +
>  #endif /* __X86_64_UACCESS_H */
> diff -Nurp linux-2.6.24-rc4/include/linux/mm.h linux/include/linux/mm.h
> --- linux-2.6.24-rc4/include/linux/mm.h	2007-12-04 08:44:56.000000000 -0600
> +++ linux/include/linux/mm.h	2007-12-10 15:01:17.000000000 -0600
> @@ -12,6 +12,7 @@
>  #include <linux/prio_tree.h>
>  #include <linux/debug_locks.h>
>  #include <linux/mm_types.h>
> +#include <linux/uaccess.h>	/* for __HAVE_ARCH_FAST_GUP */
>
>  struct mempolicy;
>  struct anon_vma;
> @@ -750,6 +751,31 @@ extern int mprotect_fixup(struct vm_area
>  			  unsigned long end, unsigned long newflags);
>
>  /*
> + * Architecture may implement efficient get_user_pages to avoid having to
> + * take the mmap sem
> + */
> +#ifndef __HAVE_ARCH_FAST_GUP
> +static inline int __fast_gup(struct mm_struct *mm, unsigned long start,
> +			     int nr_pages, int write, struct page **pages)
> +{
> +	int ret;
> +
> +	down_read(&mm->mmap_sem);
> +	ret = get_user_pages(current, mm, start, nr_pages, write,
> +			     0, pages, NULL);
> +	up_read(&mm->mmap_sem);
> +
> +	return ret;
> +}
> +/*
> + * This macro avoids having to include sched.h in this header to
> + * dereference current->mm.
> + */
> +#define fast_gup(start, nr_pages, write, pages) \
> +	__fast_gup(current->mm, start, nr_pages, write, pages)
> +#endif
> +
> +/*
>   * A callback you can register to apply pressure to ageable caches.
>   *
>   * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
> diff -Nurp linux-2.6.24-rc4/include/linux/vmstat.h
> linux/include/linux/vmstat.h ---
> linux-2.6.24-rc4/include/linux/vmstat.h	2007-10-09 15:31:38.000000000 -0500
> +++ linux/include/linux/vmstat.h	2007-12-10 15:01:17.000000000 -0600 @@
> -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
>  		FOR_ALL_ZONES(PGSCAN_DIRECT),
>  		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
>  		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
> +		FAST_GUP, FAST_GUP_SLOW,
>  		NR_VM_EVENT_ITEMS
>  };
>
> diff -Nurp linux-2.6.24-rc4/mm/vmstat.c linux/mm/vmstat.c
> --- linux-2.6.24-rc4/mm/vmstat.c	2007-12-04 08:45:01.000000000 -0600
> +++ linux/mm/vmstat.c	2007-12-10 15:01:17.000000000 -0600
> @@ -642,6 +642,9 @@ static const char * const vmstat_text[]
>  	"allocstall",
>
>  	"pgrotated",
> +	"fast_gup",
> +	"fast_gup_slow",
> +
>  #endif
>  };

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-12-12  4:57                         ` Nick Piggin
@ 2007-12-12  5:11                           ` Dave Kleikamp
  2007-12-12  5:40                             ` Nick Piggin
  0 siblings, 1 reply; 30+ messages in thread
From: Dave Kleikamp @ 2007-12-12  5:11 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

On Wed, 2007-12-12 at 15:57 +1100, Nick Piggin wrote:
> On Tuesday 11 December 2007 08:30, Dave Kleikamp wrote:

> > Nick,
> > I've played with the fast_gup patch a bit.  I was able to find a problem
> > in follow_hugetlb_page() that Adam Litke fixed.  I'm haven't been brave
> > enough to implement it on any other architectures, but I did add  a
> > default that takes mmap_sem and calls the normal get_user_pages() if the
> > architecture doesn't define fast_gup().  I put it in linux/mm.h, for
> > lack of a better place, but it's a little kludgy since I didn't want
> > mm.h to have to include sched.h.  This patch is against 2.6.24-rc4.
> > It's not ready for inclusion yet, of course.
> 
> Hi Dave,
> 
> Thanks so much. This makes it much more a complete patch (although
> still missing the "normal page" detection).
> 
> I think I missed -- or forgot -- what was the follow_hugetlb_page
> problem?

Badari found a problem running some tests and handed it off to me to
look at.  I didn't share it publicly.  Anyway, we were finding that
fastgup was taking the slow path almost all the time with huge pages.
The problem was that follow_hugetlb_page was failing to fault on a
non-writable page when it needed a writable one.  So we'd keep seeing a
non-writable page over and over.  This is fixed in 2.6.24-rc5.

> Anyway, I am hoping that someone will one day and test if this and
> find it helps their workload, but on the other hand, if it doesn't
> help anyone then we don't have to worry about adding it to the
> kernel ;) I don't have any real setups that hammers DIO with threads.
> I'm guessing DB2 and/or Oracle does?

I'll try to get someone to run a DB2 benchmark and see what it looks
like.
-- 
David Kleikamp
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-12-12  5:11                           ` Dave Kleikamp
@ 2007-12-12  5:40                             ` Nick Piggin
  2008-01-16 19:58                               ` Dave Kleikamp
  0 siblings, 1 reply; 30+ messages in thread
From: Nick Piggin @ 2007-12-12  5:40 UTC (permalink / raw)
  To: Dave Kleikamp
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

On Wednesday 12 December 2007 16:11, Dave Kleikamp wrote:
> On Wed, 2007-12-12 at 15:57 +1100, Nick Piggin wrote:
> > On Tuesday 11 December 2007 08:30, Dave Kleikamp wrote:
> > > Nick,
> > > I've played with the fast_gup patch a bit.  I was able to find a
> > > problem in follow_hugetlb_page() that Adam Litke fixed.  I'm haven't
> > > been brave enough to implement it on any other architectures, but I did
> > > add  a default that takes mmap_sem and calls the normal
> > > get_user_pages() if the architecture doesn't define fast_gup().  I put
> > > it in linux/mm.h, for lack of a better place, but it's a little kludgy
> > > since I didn't want mm.h to have to include sched.h.  This patch is
> > > against 2.6.24-rc4. It's not ready for inclusion yet, of course.
> >
> > Hi Dave,
> >
> > Thanks so much. This makes it much more a complete patch (although
> > still missing the "normal page" detection).
> >
> > I think I missed -- or forgot -- what was the follow_hugetlb_page
> > problem?
>
> Badari found a problem running some tests and handed it off to me to
> look at.  I didn't share it publicly.  Anyway, we were finding that
> fastgup was taking the slow path almost all the time with huge pages.
> The problem was that follow_hugetlb_page was failing to fault on a
> non-writable page when it needed a writable one.  So we'd keep seeing a
> non-writable page over and over.  This is fixed in 2.6.24-rc5.

Ah yes, I just saw that fix in the changelog. So not a problem with my
patch as such, but good to get that fixed.


> > Anyway, I am hoping that someone will one day and test if this and
> > find it helps their workload, but on the other hand, if it doesn't
> > help anyone then we don't have to worry about adding it to the
> > kernel ;) I don't have any real setups that hammers DIO with threads.
> > I'm guessing DB2 and/or Oracle does?
>
> I'll try to get someone to run a DB2 benchmark and see what it looks
> like.

That would be great if you could.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2007-12-12  5:40                             ` Nick Piggin
@ 2008-01-16 19:58                               ` Dave Kleikamp
  2008-01-17  6:34                                 ` Nick Piggin
  2008-01-24  7:06                                 ` Nick Piggin
  0 siblings, 2 replies; 30+ messages in thread
From: Dave Kleikamp @ 2008-01-16 19:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

On Wed, 2007-12-12 at 16:40 +1100, Nick Piggin wrote:
> On Wednesday 12 December 2007 16:11, Dave Kleikamp wrote:
> > On Wed, 2007-12-12 at 15:57 +1100, Nick Piggin wrote:

> > > Anyway, I am hoping that someone will one day and test if this and
> > > find it helps their workload, but on the other hand, if it doesn't
> > > help anyone then we don't have to worry about adding it to the
> > > kernel ;) I don't have any real setups that hammers DIO with threads.
> > > I'm guessing DB2 and/or Oracle does?
> >
> > I'll try to get someone to run a DB2 benchmark and see what it looks
> > like.
> 
> That would be great if you could.

We weren't able to get in any runs before the holidays, but we finally
have some good news from our performance team:

"To test the effects of the patch, an OLTP workload was run on an IBM
x3850 M2 server with 2 processors (quad-core Intel Xeon processors at
2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel. Comparing
runs with and without the patch resulted in an overall performance
benefit of ~9.8%. Correspondingly, oprofiles showed that samples from
__up_read and __down_read routines that is seen during thread contention
for system resources was reduced from 2.8% down to .05%. Monitoring
the /proc/vmstat output from the patched run showed that the counter for
fast_gup contained a very high number while the fast_gup_slow value was
zero."

Great work, Nick!

Thanks,
Shaggy
-- 
David Kleikamp
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2008-01-16 19:58                               ` Dave Kleikamp
@ 2008-01-17  6:34                                 ` Nick Piggin
  2008-01-24  7:06                                 ` Nick Piggin
  1 sibling, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2008-01-17  6:34 UTC (permalink / raw)
  To: Dave Kleikamp
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel, linux-arch

On Thursday 17 January 2008 06:58, Dave Kleikamp wrote:
> On Wed, 2007-12-12 at 16:40 +1100, Nick Piggin wrote:
> > On Wednesday 12 December 2007 16:11, Dave Kleikamp wrote:
> > > On Wed, 2007-12-12 at 15:57 +1100, Nick Piggin wrote:
> > > > Anyway, I am hoping that someone will one day and test if this and
> > > > find it helps their workload, but on the other hand, if it doesn't
> > > > help anyone then we don't have to worry about adding it to the
> > > > kernel ;) I don't have any real setups that hammers DIO with threads.
> > > > I'm guessing DB2 and/or Oracle does?
> > >
> > > I'll try to get someone to run a DB2 benchmark and see what it looks
> > > like.
> >
> > That would be great if you could.
>
> We weren't able to get in any runs before the holidays, but we finally
> have some good news from our performance team:
>
> "To test the effects of the patch, an OLTP workload was run on an IBM
> x3850 M2 server with 2 processors (quad-core Intel Xeon processors at
> 2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel. Comparing
> runs with and without the patch resulted in an overall performance
> benefit of ~9.8%. Correspondingly, oprofiles showed that samples from
> __up_read and __down_read routines that is seen during thread contention
> for system resources was reduced from 2.8% down to .05%. Monitoring
> the /proc/vmstat output from the patched run showed that the counter for
> fast_gup contained a very high number while the fast_gup_slow value was
> zero."
>
> Great work, Nick!

Ah, excellent. Thanks for getting those numbers Dave. This will
be a great help towards getting the patch merged.

I'm just working on the final required piece for this thing (the
pte_special pte bit, required to distinguish whether or not we
can refcount a page without looking at the vma). It is strictly
just a correctness/security measure, which is why you were able
to run tests without it. And it won't add any significant cost to
the fastpaths, so the numbers remain valid.

FWIW, I cc'ed linux-arch: the lockless get_user_pages patch has
architecture specific elements, so it will need some attention
there. If other architectures are interested (eg. powerpc or
ia64), then I will be happy to work with maintainers to help
try to devise a way of fitting it into their tlb flushing scheme.
Ping me if you'd like to take up the offer.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] lockless get_user_pages for dio (and more)
  2008-01-16 19:58                               ` Dave Kleikamp
  2008-01-17  6:34                                 ` Nick Piggin
@ 2008-01-24  7:06                                 ` Nick Piggin
  1 sibling, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2008-01-24  7:06 UTC (permalink / raw)
  To: Dave Kleikamp
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck,
	Adam Litke, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1491 bytes --]

On Thursday 17 January 2008 06:58, Dave Kleikamp wrote:

> We weren't able to get in any runs before the holidays, but we finally
> have some good news from our performance team:
>
> "To test the effects of the patch, an OLTP workload was run on an IBM
> x3850 M2 server with 2 processors (quad-core Intel Xeon processors at
> 2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel. Comparing
> runs with and without the patch resulted in an overall performance
> benefit of ~9.8%. Correspondingly, oprofiles showed that samples from
> __up_read and __down_read routines that is seen during thread contention
> for system resources was reduced from 2.8% down to .05%. Monitoring
> the /proc/vmstat output from the patched run showed that the counter for
> fast_gup contained a very high number while the fast_gup_slow value was
> zero."

Just for reference, I've attached a more complete patch for x86,
which has to be applied on top of the pte_special patch posted in
another thread.

No need to test anything at this point... the generated code for
this version is actually slightly better than the last one despite
the extra condition being tested for. With a few tweak I was
actually able to reduce the number of tests in the inner loop, and
adding noinline to the leaf functions helps keep them in registers.

I'm currently having a look at an initial powerpc 64 patch,
hopefully we'll see similar improvements there. Will post that when
I get further along with it.

Thanks,
Nick

[-- Attachment #2: mm-get_user_pages-fast.patch --]
[-- Type: text/x-diff, Size: 12491 bytes --]

Introduce a new "fast_gup" (for want of a better name right now) which
is basically a get_user_pages with a less general API that is more suited
to the common case.

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This allows (at least on x86), an optimistic lockless pagetable walk,
without taking any page table locks or even mmap_sem. Page table existence
is guaranteed by turning interrupts off (combined with the fact that we're
always looking up the current mm, which would need an IPI before its
pagetables could be shot down from another CPU).

Many other architectures could do the same thing. Those that don't IPI
could potentially RCU free the page tables and do speculative references
on the pages (a la lockless pagecache) to achieve a lockless fast_gup.


---
Index: linux-2.6/arch/x86/lib/Makefile_64
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile_64
+++ linux-2.6/arch/x86/lib/Makefile_64
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup.o
Index: linux-2.6/arch/x86/lib/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/lib/gup.c
@@ -0,0 +1,189 @@
+/*
+ * Lockless fast_gup for x86
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	result = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		result |= _PAGE_RW;
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		/*
+		 * XXX: careful. On 3-level 32-bit, the pte is 64 bits, and
+		 * we need to make sure we load the low word first, then the
+		 * high. This means _PAGE_PRESENT should be clear if the high
+		 * word was not valid. Currently, the C compiler can issue
+		 * the loads in any order, and I don't know of a wrapper
+		 * function that will do this properly, so it is broken on
+		 * 32-bit 3-level for the moment.
+		 */
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+	do {
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid huge latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86), we can follow the address down to the the
+	 * page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+slow:
+	{
+		int i, ret;
+
+		local_irq_enable();
+		/* Could optimise this more by keeping what we've already got */
+		for (i = 0; i < nr; i++)
+			put_page(pages[i]);
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		return ret;
+	}
+}
Index: linux-2.6/include/asm-x86/uaccess_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/uaccess_64.h
+++ linux-2.6/include/asm-x86/uaccess_64.h
@@ -381,4 +381,8 @@ static inline int __copy_from_user_inato
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+#define __HAVE_ARCH_FAST_GUP
+struct page;
+int fast_gup(unsigned long start, int nr_pages, int write, struct page **pages);
+
 #endif /* __X86_64_UACCESS_H */
Index: linux-2.6/fs/bio.c
===================================================================
--- linux-2.6.orig/fs/bio.c
+++ linux-2.6/fs/bio.c
@@ -637,12 +637,7 @@ static struct bio *__bio_map_user_iov(st
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
-
+		ret = fast_gup(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -221,10 +221,7 @@ static struct page *blk_get_page(unsigne
 	if (pvec->idx == pvec->nr) {
 		nr_pages = PAGES_SPANNED(addr, count);
 		nr_pages = min(nr_pages, VEC_SIZE);
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, addr, nr_pages,
-				     rw == READ, 0, pvec->page, NULL);
-		up_read(&current->mm->mmap_sem);
+		ret = fast_gup(addr, nr_pages, rw == READ, pvec->page);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		pvec->nr = ret;
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c
+++ linux-2.6/fs/direct-io.c
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = fast_gup(
 		dio->curr_user_address,		/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -1174,33 +1174,6 @@ static long do_splice(struct file *in, l
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1213,8 +1186,6 @@ static int get_iovec_page_array(const st
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1223,7 +1194,7 @@ static int get_iovec_page_array(const st
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1257,9 +1228,7 @@ static int get_iovec_page_array(const st
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = fast_gup((unsigned long)base, npages, 0, &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1298,8 +1267,6 @@ static int get_iovec_page_array(const st
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;
 
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
 #include <linux/security.h>
+#include <linux/uaccess.h> /* for __HAVE_ARCH_FAST_GUP */
 
 struct mempolicy;
 struct anon_vma;
@@ -767,6 +768,24 @@ extern int mprotect_fixup(struct vm_area
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
 
+#ifndef __HAVE_ARCH_FAST_GUP
+/* Should be moved to asm-generic, and architectures can include it if they
+ * don't implement their own fast_gup.
+ */
+#define fast_gup(start, nr_pages, write, pages)			\
+({								\
+	struct mm_struct *mm = current->mm;			\
+	int ret;						\
+								\
+	down_read(&mm->mmap_sem);				\
+	ret = get_user_pages(current, mm, start, nr_pages,	\
+					write, 0, pages, NULL);	\
+	up_read(&mm->mmap_sem);					\
+								\
+	ret;							\
+})
+#endif
+
 /*
  * A callback you can register to apply pressure to ageable caches.
  *

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-13 23:27               ` Nick Piggin
  2007-10-14  1:01                 ` [rfc] lockless get_user_pages for dio (and more) Nick Piggin
@ 2007-10-14 15:42                 ` Siddha, Suresh B
  2007-10-15  4:17                   ` Nick Piggin
  1 sibling, 1 reply; 30+ messages in thread
From: Siddha, Suresh B @ 2007-10-14 15:42 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Siddha, Suresh B, Ken Chen, Badari Pulavarty, linux-mm, tony.luck

On Sun, Oct 14, 2007 at 09:27:46AM +1000, Nick Piggin wrote:
> On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> > On ia64, we have "tpa" instruction which does the virtual to physical
> > address conversion for us. But talking to Tony, that will fault during not
> > present or vhpt misses.
> >
> > Well, for now, manual walk is probably the best we have.
> 
> Hmm, we'd actually want it to fault, and go through the full
> handle_mm_fault path if possible, and somehow just give an

But, this walk was happening with interrupts disabled. So the best will be
to have a peek at the page tables without faulting and the peek can comeback
and say, sorry, you have to go through slowest path.

Anyhow, lets first make sure that no one else has any major issues with
the simplest solution first :)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [rfc] more granular page table lock for hugepages
  2007-10-14 15:42                 ` [rfc] more granular page table lock for hugepages Siddha, Suresh B
@ 2007-10-15  4:17                   ` Nick Piggin
  0 siblings, 0 replies; 30+ messages in thread
From: Nick Piggin @ 2007-10-15  4:17 UTC (permalink / raw)
  To: Siddha, Suresh B; +Cc: Ken Chen, Badari Pulavarty, linux-mm, tony.luck

On Monday 15 October 2007 01:42, Siddha, Suresh B wrote:
> On Sun, Oct 14, 2007 at 09:27:46AM +1000, Nick Piggin wrote:
> > On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:
> > > On ia64, we have "tpa" instruction which does the virtual to physical
> > > address conversion for us. But talking to Tony, that will fault during
> > > not present or vhpt misses.
> > >
> > > Well, for now, manual walk is probably the best we have.
> >
> > Hmm, we'd actually want it to fault, and go through the full
> > handle_mm_fault path if possible, and somehow just give an
>
> But, this walk was happening with interrupts disabled. So the best will be
> to have a peek at the page tables without faulting and the peek can
> comeback and say, sorry, you have to go through slowest path.

Oh yeah, you're right of course. I guess it is probably better to
do that way anyway: either way we have to take mmap_sem, and once
taken, it is probably better to hold it and batch up the rest of
the operations in the slowpath.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2008-01-24  7:06 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-08 22:52 [rfc] more granular page table lock for hugepages Siddha, Suresh B
2007-10-09 20:23 ` Ken Chen
2007-10-09 21:05   ` Badari Pulavarty
2007-10-10  0:15     ` Siddha, Suresh B
2007-10-10  6:10       ` Ken Chen
2007-10-10  7:50         ` Ken Chen
2007-10-11 11:39           ` Nick Piggin
2007-10-12 20:34             ` Siddha, Suresh B
2007-10-13 23:27               ` Nick Piggin
2007-10-14  1:01                 ` [rfc] lockless get_user_pages for dio (and more) Nick Piggin
2007-10-14 18:19                   ` Siddha, Suresh B
2007-10-15  4:15                     ` Nick Piggin
2007-10-15 12:25                     ` Nick Piggin
2007-10-15 17:03                       ` Badari Pulavarty
2007-10-15 17:49                         ` Siddha, Suresh B
2007-10-15 17:54                       ` Siddha, Suresh B
2007-10-15 20:21                       ` Ken Chen
2007-10-16  2:15                         ` Nick Piggin
2007-10-16  0:14                           ` Dave Hansen
2007-10-16  3:26                             ` Nick Piggin
2007-10-16  3:32                           ` Nick Piggin
2007-12-10 21:30                       ` Dave Kleikamp
2007-12-12  4:57                         ` Nick Piggin
2007-12-12  5:11                           ` Dave Kleikamp
2007-12-12  5:40                             ` Nick Piggin
2008-01-16 19:58                               ` Dave Kleikamp
2008-01-17  6:34                                 ` Nick Piggin
2008-01-24  7:06                                 ` Nick Piggin
2007-10-14 15:42                 ` [rfc] more granular page table lock for hugepages Siddha, Suresh B
2007-10-15  4:17                   ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).