* [RFC] Demand faulting for large pages
@ 2005-08-05 15:21 Adam Litke
2005-08-05 15:53 ` Andi Kleen
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Adam Litke @ 2005-08-05 15:21 UTC (permalink / raw)
To: linux-kernel; +Cc: ak, christoph, dwg
Below is a patch to implement demand faulting for huge pages. The main
motivation for changing from prefaulting to demand faulting is so that
huge page allocations can follow the NUMA API. Currently, huge pages
are allocated round-robin from all NUMA nodes.
The default behavior in SLES9 for i386 is to use demand faulting with
NUMA policy-aware allocations. To my knowledge, this continues to work
well in practice. Thanks to consolidated hugetlb code, switching the
behavior requires changing only one fault handler. The bulk of the
patch just moves the logic from hugetlb_prefault() to
hugetlb_pte_fault().
Diffed against 2.6.13-rc4-git4
Signed-off-by: Adam Litke <agl@us.ibm.com>
fs/hugetlbfs/inode.c | 5 -
include/linux/hugetlb.h | 2
mm/hugetlb.c | 140 +++++++++++++++++++++++++++---------------------
mm/memory.c | 7 --
4 files changed, 83 insertions(+), 71 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -79,10 +79,7 @@ static int hugetlbfs_file_mmap(struct fi
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
if (inode->i_size < len)
inode->i_size = len;
out:
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t);
unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void);
void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+ unsigned long address, int write_access);
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -277,18 +277,20 @@ int copy_hugetlb_page_range(struct mm_st
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
- while (addr < end) {
+ for (; addr < end; addr += HPAGE_SIZE) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte || pte_none(*src_pte))
+ continue;
+
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
- src_pte = huge_pte_offset(src, addr);
- BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+ BUG_ON(!src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
- addr += HPAGE_SIZE;
}
return 0;
@@ -329,63 +331,6 @@ void zap_hugepage_range(struct vm_area_s
spin_unlock(&mm->page_table_lock);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
- hugetlb_prefault_arch_hook(mm);
-
- spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
-
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
- if (! pte_none(*pte))
- hugetlb_clean_stale_pgtable(pte);
-
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
- }
-out:
- spin_unlock(&mm->page_table_lock);
- return ret;
-}
-
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
@@ -433,3 +378,76 @@ int follow_hugetlb_page(struct mm_struct
return i;
}
+
+int hugetlb_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ int ret = VM_FAULT_MINOR;
+ unsigned long idx;
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ BUG_ON(!vma->vm_file);
+
+ pte = huge_pte_alloc(mm, address);
+ if (!pte) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if (! pte_none(*pte))
+ goto flush;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+retry:
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if(add_to_page_cache(page, mapping, idx, GFP_ATOMIC)) {
+ put_page(page);
+ goto retry;
+ }
+ unlock_page(page);
+ }
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+flush:
+ flush_tlb_page(vma, address);
+out:
+ return ret;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ int rc = VM_FAULT_SIGBUS;
+
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_alloc(mm, address & HPAGE_MASK);
+ if (! ptep) {
+ BUG();
+ goto out;
+ }
+ if (pte_none(*ptep))
+ rc = hugetlb_pte_fault(mm, vma, address, write_access);
+out:
+ spin_unlock(&mm->page_table_lock);
+ return rc;
+}
diff -upN reference/mm/memory.c current/mm/memory.c
--- reference/mm/memory.c
+++ current/mm/memory.c
@@ -933,11 +933,6 @@ int get_user_pages(struct task_struct *t
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
struct page *page;
@@ -2024,7 +2019,7 @@ int handle_mm_fault(struct mm_struct *mm
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, address, write_access);
/*
* We need the page table lock to synchronize with kswapd
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC] Demand faulting for large pages
2005-08-05 15:21 [RFC] Demand faulting for large pages Adam Litke
@ 2005-08-05 15:53 ` Andi Kleen
2005-08-05 16:37 ` Adam Litke
2005-08-05 21:05 ` Chen, Kenneth W
2005-08-05 21:33 ` Chen, Kenneth W
2 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2005-08-05 15:53 UTC (permalink / raw)
To: Adam Litke; +Cc: linux-kernel, ak, christoph, dwg
On Fri, Aug 05, 2005 at 10:21:38AM -0500, Adam Litke wrote:
> Below is a patch to implement demand faulting for huge pages. The main
> motivation for changing from prefaulting to demand faulting is so that
> huge page allocations can follow the NUMA API. Currently, huge pages
> are allocated round-robin from all NUMA nodes.
I think matching DEFAULT is better than having a different default for
huge pages than for small pages.
In general more programs are happy with local memory than remote memory.
Also it makes it consistent.
>
> The default behavior in SLES9 for i386 is to use demand faulting with
> NUMA policy-aware allocations. To my knowledge, this continues to work
Not sure what you're trying to say here. All allocations are NUMA policy aware.
> well in practice. Thanks to consolidated hugetlb code, switching the
> behavior requires changing only one fault handler. The bulk of the
> patch just moves the logic from hugelb_prefault() to
> hugetlb_pte_fault().
Are you sure you fixed get_user_pages to handle this properly? It doesn't
look like it.
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] Demand faulting for large pages
2005-08-05 15:53 ` Andi Kleen
@ 2005-08-05 16:37 ` Adam Litke
2005-08-05 16:47 ` Andi Kleen
0 siblings, 1 reply; 13+ messages in thread
From: Adam Litke @ 2005-08-05 16:37 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, christoph, dwg
On Fri, 2005-08-05 at 10:53, Andi Kleen wrote:
> On Fri, Aug 05, 2005 at 10:21:38AM -0500, Adam Litke wrote:
> > Below is a patch to implement demand faulting for huge pages. The main
> > motivation for changing from prefaulting to demand faulting is so that
> > huge page allocations can follow the NUMA API. Currently, huge pages
> > are allocated round-robin from all NUMA nodes.
>
> I think matching DEFAULT is better than having a different default for
> huge pages than for small pages.
I am not exactly sure what the above means. Is 'DEFAULT' a system
default numa allocation policy?
> In general more programs are happy with local memory than remote memory.
I totally agree.
> Also it makes it consistent.
>
> >
> > The default behavior in SLES9 for i386 is to use demand faulting with
> > NUMA policy-aware allocations. To my knowledge, this continues to work
>
> Not sure what you're trying to say here. All allocations are NUMA policy aware.
Sorry, I really wasn't clear. That statement referred to huge pages
specifically. I was trying to point out that numa policy-aware huge
page allocation combined with demand faulting in SLES9/i386 has been a
success.
> > well in practice. Thanks to consolidated hugetlb code, switching the
> > behavior requires changing only one fault handler. The bulk of the
> > patch just moves the logic from hugelb_prefault() to
> > hugetlb_pte_fault().
>
> Are you sure you fixed get_user_pages to handle this properly? It doesn't
> like it.
Unless I am missing something, the call to follow_hugetlb_page() in
get_user_pages() is just an optimization. Removing it means
follow_page() will be called individually for each PAGE_SIZE page in the
huge page. We can probably do better but I didn't want to cloud this
patch with that logic.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] Demand faulting for large pages
2005-08-05 16:37 ` Adam Litke
@ 2005-08-05 16:47 ` Andi Kleen
2005-08-05 17:00 ` Adam Litke
2005-08-05 17:09 ` Christoph Lameter
0 siblings, 2 replies; 13+ messages in thread
From: Andi Kleen @ 2005-08-05 16:47 UTC (permalink / raw)
To: Adam Litke; +Cc: Andi Kleen, linux-kernel, christoph, dwg
On Fri, Aug 05, 2005 at 11:37:27AM -0500, Adam Litke wrote:
> On Fri, 2005-08-05 at 10:53, Andi Kleen wrote:
> > On Fri, Aug 05, 2005 at 10:21:38AM -0500, Adam Litke wrote:
> > > Below is a patch to implement demand faulting for huge pages. The main
> > > motivation for changing from prefaulting to demand faulting is so that
> > > huge page allocations can follow the NUMA API. Currently, huge pages
> > > are allocated round-robin from all NUMA nodes.
> >
> > I think matching DEFAULT is better than having a different default for
> > huge pages than for small pages.
>
> I am not exactly sure what the above means. Is 'DEFAULT' a system
> default numa allocation policy?
It's one of the four numa policies: DEFAULT, PREFERED, INTERLEAVE, BIND
It just means allocate on the local node if possible, otherwise fall back.
You said you wanted INTERLEAVE by default, which i think is a bad idea.
It should be only optional like in all other allocations.
> > > patch just moves the logic from hugelb_prefault() to
> > > hugetlb_pte_fault().
> >
> > Are you sure you fixed get_user_pages to handle this properly? It doesn't
> > like it.
>
> Unless I am missing something, the call to follow_hugetlb_page() in
> get_user_pages() is just an optimization. Removing it means
> follow_page() will be called individually for each PAGE_SIZE page in the
> huge page. We can probably do better but I didn't want to cloud this
> patch with that logic.
The problem is that get_user_pages needs to handle the case of a large
page not yet being faulted in properly. The SLES9 implementation did
some changes for this.
You don't change it at all, so I suspect it doesn't work yet.
It's a common case - think people doing raw IO on huge pages shared memory.
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] Demand faulting for large pages
2005-08-05 16:47 ` Andi Kleen
@ 2005-08-05 17:00 ` Adam Litke
2005-08-05 17:12 ` Andi Kleen
2005-08-05 17:09 ` Christoph Lameter
1 sibling, 1 reply; 13+ messages in thread
From: Adam Litke @ 2005-08-05 17:00 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, christoph, dwg
On Fri, 2005-08-05 at 11:47, Andi Kleen wrote:
> On Fri, Aug 05, 2005 at 11:37:27AM -0500, Adam Litke wrote:
> > On Fri, 2005-08-05 at 10:53, Andi Kleen wrote:
> > > On Fri, Aug 05, 2005 at 10:21:38AM -0500, Adam Litke wrote:
> > > > Below is a patch to implement demand faulting for huge pages. The main
> > > > motivation for changing from prefaulting to demand faulting is so that
> > > > huge page allocations can follow the NUMA API. Currently, huge pages
> > > > are allocated round-robin from all NUMA nodes.
> > >
> > > I think matching DEFAULT is better than having a different default for
> > > huge pages than for small pages.
> >
> > I am not exactly sure what the above means. Is 'DEFAULT' a system
> > default numa allocation policy?
>
> It's one of the four numa policies: DEFAULT, PREFERED, INTERLEAVE, BIND
>
> It just means allocate on the local node if possible, otherwise fall back.
>
> You said you wanted INTERLEAVE by default, which i think is a bad idea.
> It should be only optional like in all other allocations.
I tried to say that allocations are _currently_ INTERLEAVE (aka
round-robin) but that I want it to be configurable. So I think we are
in agreement here.
> > > > patch just moves the logic from hugelb_prefault() to
> > > > hugetlb_pte_fault().
> > >
> > > Are you sure you fixed get_user_pages to handle this properly? It doesn't
> > > like it.
> >
> > Unless I am missing something, the call to follow_hugetlb_page() in
> > get_user_pages() is just an optimization. Removing it means
> > follow_page() will be called individually for each PAGE_SIZE page in the
> > huge page. We can probably do better but I didn't want to cloud this
> > patch with that logic.
>
> The problem is that get_user_pages needs to handle the case of a large
> page not yet being faulted in properly. The SLES9 implementation did
> some changes for this.
>
> You don't change it at all, so I suspect it doesn't work yet.
What about:
--- reference/mm/memory.c
+++ current/mm/memory.c
@@ -933,11 +933,6 @@ int get_user_pages(struct task_struct *t
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
struct page *page;
> It's a common case - think people doing raw IO on huge pages shared memory.
My Direct IO test seemed to work fine, but I'll give this a closer look
to make sure follow_huge_{addr|pmd} never return a page for an unfaulted
hugetlb page. Thanks for your close scrutiny and comments.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [RFC] Demand faulting for large pages
2005-08-05 17:00 ` Adam Litke
@ 2005-08-05 17:12 ` Andi Kleen
0 siblings, 0 replies; 13+ messages in thread
From: Andi Kleen @ 2005-08-05 17:12 UTC (permalink / raw)
To: Adam Litke; +Cc: Andi Kleen, linux-kernel, christoph, dwg
On Fri, Aug 05, 2005 at 12:00:00PM -0500, Adam Litke wrote:
> On Fri, 2005-08-05 at 11:47, Andi Kleen wrote:
> > On Fri, Aug 05, 2005 at 11:37:27AM -0500, Adam Litke wrote:
> > > On Fri, 2005-08-05 at 10:53, Andi Kleen wrote:
> > > > On Fri, Aug 05, 2005 at 10:21:38AM -0500, Adam Litke wrote:
> > > > > Below is a patch to implement demand faulting for huge pages. The main
> > > > > motivation for changing from prefaulting to demand faulting is so that
> > > > > huge page allocations can follow the NUMA API. Currently, huge pages
> > > > > are allocated round-robin from all NUMA nodes.
> > > >
> > > > I think matching DEFAULT is better than having a different default for
> > > > huge pages than for small pages.
> > >
> > > I am not exactly sure what the above means. Is 'DEFAULT' a system
> > > default numa allocation policy?
> >
> > It's one of the four numa policies: DEFAULT, PREFERED, INTERLEAVE, BIND
> >
> > It just means allocate on the local node if possible, otherwise fall back.
> >
> > You said you wanted INTERLEAVE by default, which i think is a bad idea.
> > It should be only optional like in all other allocations.
>
> I tried to say that allocations are _currently_ INTERLEAVE (aka
> round-robin) but that I want it to be configurable. So I think we are
> in agreement here.
Ok, but the way to configure it should be the standard mempolicy
infrastructure (that is how it works in SLES9 too)
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] Demand faulting for large pages
2005-08-05 16:47 ` Andi Kleen
2005-08-05 17:00 ` Adam Litke
@ 2005-08-05 17:09 ` Christoph Lameter
1 sibling, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2005-08-05 17:09 UTC (permalink / raw)
To: Andi Kleen; +Cc: Adam Litke, linux-kernel, dwg
On Fri, 5 Aug 2005, Andi Kleen wrote:
> > Unless I am missing something, the call to follow_hugetlb_page() in
> > get_user_pages() is just an optimization. Removing it means
> > follow_page() will be called individually for each PAGE_SIZE page in the
> > huge page. We can probably do better but I didn't want to cloud this
> > patch with that logic.
>
> The problem is that get_user_pages needs to handle the case of a large
> page not yet being faulted in properly. The SLES9 implementation did
> some changes for this.
>
> You don't change it at all, so I suspect it doesn't work yet.
>
> It's a common case - think people doing raw IO on huge pages shared memory.
Seems that follow_page calls follow_huge_addr for huge pages.
follow_huge_addr is arch specific and so we would need to verify that all
arches do the right things if the page is not present.
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [RFC] Demand faulting for large pages
2005-08-05 15:21 [RFC] Demand faulting for large pages Adam Litke
2005-08-05 15:53 ` Andi Kleen
@ 2005-08-05 21:05 ` Chen, Kenneth W
2005-08-05 21:35 ` Andi Kleen
2005-08-05 21:33 ` Chen, Kenneth W
2 siblings, 1 reply; 13+ messages in thread
From: Chen, Kenneth W @ 2005-08-05 21:05 UTC (permalink / raw)
To: 'Adam Litke', linux-kernel; +Cc: ak, christoph, dwg
Adam Litke wrote on Friday, August 05, 2005 8:22 AM
> Below is a patch to implement demand faulting for huge pages. The main
> motivation for changing from prefaulting to demand faulting is so that
> huge page allocations can follow the NUMA API. Currently, huge pages
> are allocated round-robin from all NUMA nodes.
Are users of hugetlb going to accept the fact that now an app will SIGBUS
when there aren't enough free hugetlb pages present at the time of fault?
It's not very nice though, but is that the general consensus?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] Demand faulting for large pages
2005-08-05 21:05 ` Chen, Kenneth W
@ 2005-08-05 21:35 ` Andi Kleen
0 siblings, 0 replies; 13+ messages in thread
From: Andi Kleen @ 2005-08-05 21:35 UTC (permalink / raw)
To: Chen, Kenneth W; +Cc: 'Adam Litke', linux-kernel, ak, christoph, dwg
On Fri, Aug 05, 2005 at 02:05:42PM -0700, Chen, Kenneth W wrote:
> Adam Litke wrote on Friday, August 05, 2005 8:22 AM
> > Below is a patch to implement demand faulting for huge pages. The main
> > motivation for changing from prefaulting to demand faulting is so that
> > huge page allocations can follow the NUMA API. Currently, huge pages
> > are allocated round-robin from all NUMA nodes.
>
> Do users of hugetlb going to accept the fact that now app will SIGBUS
> when there aren't enough free hugetlb pages present at the time of fault?
> It's not very nice though, but is that the general consensus?
Probably not. But the simple minded overcommit check at mapping that was in the
earlier SLES codebase seemed to work for people (or at least I've never
heard a complaint about that). Adding that should be pretty easy.
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [RFC] Demand faulting for large pages
2005-08-05 15:21 [RFC] Demand faulting for large pages Adam Litke
2005-08-05 15:53 ` Andi Kleen
2005-08-05 21:05 ` Chen, Kenneth W
@ 2005-08-05 21:33 ` Chen, Kenneth W
2005-08-05 22:05 ` Chen, Kenneth W
2 siblings, 1 reply; 13+ messages in thread
From: Chen, Kenneth W @ 2005-08-05 21:33 UTC (permalink / raw)
To: 'Adam Litke', linux-kernel; +Cc: ak, christoph, dwg
Adam Litke wrote on Friday, August 05, 2005 8:22 AM
> +int hugetlb_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, int write_access)
> +{
> + int ret = VM_FAULT_MINOR;
> + unsigned long idx;
> + pte_t *pte;
> + struct page *page;
> + struct address_space *mapping;
> +
> + WARN_ON(!is_vm_hugetlb_page(vma));
Spurious WARN_ON. Calls to hugetlb_pte_fault() are conditioned upon
if (is_vm_hugetlb_page(vma))
> +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, int write_access)
> +{
> + ....
> + if (pte_none(*ptep))
> + rc = hugetlb_pte_fault(mm, vma, address, write_access);
> +}
Broken here. Return VM_FAULT_SIGBUS when *pte is present?? Why
can't you move all the logic into hugetlb_pte_fault and simply call
it directly from handle_mm_fault?
^ permalink raw reply [flat|nested] 13+ messages in thread* RE: [RFC] Demand faulting for large pages
2005-08-05 21:33 ` Chen, Kenneth W
@ 2005-08-05 22:05 ` Chen, Kenneth W
2005-08-08 22:16 ` Adam Litke
0 siblings, 1 reply; 13+ messages in thread
From: Chen, Kenneth W @ 2005-08-05 22:05 UTC (permalink / raw)
To: 'Adam Litke', linux-kernel; +Cc: ak, christoph, dwg
Adam Litke wrote on Friday, August 05, 2005 8:22 AM
> Below is a patch to implement demand faulting for huge pages. The main
> motivation for changing from prefaulting to demand faulting is so that
> huge page allocations can follow the NUMA API. Currently, huge pages
> are allocated round-robin from all NUMA nodes.
Chen, Kenneth W wrote on Friday, August 05, 2005 2:34 PM
> Spurious WARN_ON. Calls to hugetlb_pte_fault() is conditioned upon
> if (is_vm_hugetlb_page(vma))
>
> ....
>
> Broken here. Return VM_FAULT_SIGBUS when *pte is present?? Why
> can't you move all the logic into hugetlb_pte_fault and simply call
> it directly from handle_mm_fault?
I'm wondering has this patch ever been tested? More broken bits:
in arch/i386/mm/hugetlbpage.c:huge_pte_offset - with demand paging,
you can't unconditionally walk the page table without checking
existence of pud and pmd.
I haven't looked closely at recent change in free_pgtables(), but
we used to have a need to scrub old pmd mapping before allocate one
for hugetlb pte on x86. You have to do that in huge_pte_alloc(),
I'm specifically concerned with arch/i386/mm/hugetlbpage.c:huge_pte_alloc()
- Ken
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [RFC] Demand faulting for large pages
2005-08-05 22:05 ` Chen, Kenneth W
@ 2005-08-08 22:16 ` Adam Litke
2005-08-08 22:36 ` Chen, Kenneth W
0 siblings, 1 reply; 13+ messages in thread
From: Adam Litke @ 2005-08-08 22:16 UTC (permalink / raw)
To: Chen, Kenneth W; +Cc: linux-kernel, ak, christoph, dwg
On Fri, 2005-08-05 at 17:05, Chen, Kenneth W wrote:
> Adam Litke wrote on Friday, August 05, 2005 8:22 AM
> > Below is a patch to implement demand faulting for huge pages. The main
> > motivation for changing from prefaulting to demand faulting is so that
> > huge page allocations can follow the NUMA API. Currently, huge pages
> > are allocated round-robin from all NUMA nodes.
>
> Chen, Kenneth W wrote on Friday, August 05, 2005 2:34 PM
> > Spurious WARN_ON. Calls to hugetlb_pte_fault() is conditioned upon
> > if (is_vm_hugetlb_page(vma))
> >
> > ....
> >
> > Broken here. Return VM_FAULT_SIGBUS when *pte is present?? Why
> > can't you move all the logic into hugetlb_pte_fault and simply call
> > it directly from handle_mm_fault?
The reason for the VM_FAULT_SIGBUS default return is because I thought a
fault on a pte_present hugetlb page was an invalid/unhandled fault.
I'll have another think about races to the fault handler though.
With respect to your code logic comment: The idea was to make
hugetlb_fault() an entry point into the huge page fault handling code.
This would make the task of adding other types of faults (Copy on Write
for example) easier later. If people prefer, it would be easy enough to
roll everything into hugetlb_pte_fault().
> I'm wondering has this patch ever been tested? More broken bits:
> in arch/i386/mm/hugetlbpage.c:huge_pte_offset - with demand paging,
> you can't unconditionally walk the page table without checking
> existence of pud and pmd.
I have tested the patch to the best extent that I can, but would
definitely appreciate more :) Thanks for the hint about page table
walking. I've fixed that up for the next iteration.
>
> I haven't looked closely at recent change in free_pgtables(), but
> we used to have a need to scrub old pmd mapping before allocate one
> for hugetlb pte on x86. You have to do that in huge_pte_alloc(),
> I'm specifically concerned with arch/i386/mm/hugetlbpage.c:huge_pte_alloc()
I've definitely been able to produce some strange behavior on 2.6.7
relative to your post about this topic here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0406.2/0234.html
I confirmed the fix in 2.6.8 and also don't see the problem when using
my demand fault patch. Do you have a copy of the program you used to
generate the Oops in the post linked above so I can use it as a test
case? I'd guess either the problem is gone entirely with demand
faulting, or just harder to trigger.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [RFC] Demand faulting for large pages
2005-08-08 22:16 ` Adam Litke
@ 2005-08-08 22:36 ` Chen, Kenneth W
0 siblings, 0 replies; 13+ messages in thread
From: Chen, Kenneth W @ 2005-08-08 22:36 UTC (permalink / raw)
To: 'Adam Litke'; +Cc: linux-kernel, ak, christoph, dwg
Adam Litke wrote on Monday, August 08, 2005 3:17 PM
> The reason for the VM_FAULT_SIGBUS default return is because I thought a
> fault on a pte_present hugetlb page was an invalid/unhandled fault.
> I'll have another think about races to the fault handler though.
Two threads fault on the same pte, one won the race, install a pte, 2nd
thread then walk the page table, found the pte inserted by the first
thread, exit with SIGBUS, kernel then kills the entire app with sigbus.
> I've definitely been able to produce some strange behavior on 2.6.7
> relative to your post about this topic here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0406.2/0234.html
> I confirmed the fix in 2.6.8 and also don't see the problem when using
> my demand fault patch. Do you have a copy of the program you used to
> generate the Oops in the post linked above so I can use it as a test
> case? I'd guess either the problem is gone entirely with demand
> faulting, or just harder to trigger.
Demand paging doesn't solve this problem. It is related to how pmds
are freed.
- Ken
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2005-08-08 22:37 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-08-05 15:21 [RFC] Demand faulting for large pages Adam Litke
2005-08-05 15:53 ` Andi Kleen
2005-08-05 16:37 ` Adam Litke
2005-08-05 16:47 ` Andi Kleen
2005-08-05 17:00 ` Adam Litke
2005-08-05 17:12 ` Andi Kleen
2005-08-05 17:09 ` Christoph Lameter
2005-08-05 21:05 ` Chen, Kenneth W
2005-08-05 21:35 ` Andi Kleen
2005-08-05 21:33 ` Chen, Kenneth W
2005-08-05 22:05 ` Chen, Kenneth W
2005-08-08 22:16 ` Adam Litke
2005-08-08 22:36 ` Chen, Kenneth W
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox