[RFC v2][PATCH]page_fault retry with NOPAGE

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
@ 2008-12-05 19:40 Ying Han
  2008-12-06  9:52 ` Török Edwin
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Ying Han @ 2008-12-05 19:40 UTC (permalink / raw)
  To: linux-mm, linux-kernel, akpm, Ingo Molnar, Mike Waychison,
	David Rientjes, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

changelog[v2]:
- reduce the runtime overhead by extending the 'write' flag of
  handle_mm_fault() to indicate the retry hint.
- add another two branches in filemap_fault with retry logic.
- replace find_lock_page with find_lock_page_retry to make the code
  cleaner.

todo:
- there is potential a starvation hole with the retry. By the time the
  retry returns, the pages might be released. we can make change by holding
  page reference as well as remembering what the page "was"(in case the
  file was truncated). any suggestion here are welcomed.

I also made patches for all other arch. I am posting x86_64 here first and
i will post others by the time everyone feels comfortable of this patch.

Edwin, please test this patch with your testcase and check if you get any
performance improvement of mmap over read. I added another two more places
in filemap_fault with retry logic which you might hit in your privous
experiment.


page fault retry with NOPAGE_RETRY
Allow major faults to drop the mmap_sem read lock while waiting for
synchronous disk read. This allows another thread which wishes to grab
down_write(mmap_sem) to proceed while the current is waiting the disk IO.

The patch extend the 'write' flag of handle_mm_fault() to FAULT_FLAG_RETRY
as identify that the caller can tolerate the retry in the filemap_fault call
patch.

This patch helps a lot in cases we have writer which is waitting behind all
readers, so it could execute much faster.


 Signed-off-by: Mike Waychison <mikew@google.com>
 Signed-off-by: Ying Han <yinghan@google.com>


diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730..5cf5eff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigne
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 #endif
+	unsigned int retry_flag = FAULT_FLAG_RETRY;

 	tsk = current;
 	mm = tsk->mm;
@@ -689,6 +690,7 @@ again:
 		down_read(&mm->mmap_sem);
 	}

+retry:
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
@@ -715,6 +717,7 @@ again:
 good_area:
 	si_code = SEGV_ACCERR;
 	write = 0;
+	write |= retry_flag;
 	switch (error_code & (PF_PROT|PF_WRITE)) {
 	default:	/* 3: write, present */
 		/* fall through */
@@ -743,6 +746,15 @@ good_area:
 			goto do_sigbus;
 		BUG();
 	}
+
+	if (fault & VM_FAULT_RETRY) {
+		if (write & FAULT_FLAG_RETRY) {
+			retry_flag &= ~FAULT_FLAG_RETRY;
+			goto retry;
+		}
+		BUG();
+	}
+
 	if (fault & VM_FAULT_MAJOR)
 		tsk->maj_flt++;
 	else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..9cc65a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,7 +144,7 @@ extern pgprot_t protection_map[16];

 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
-
+#define FAULT_FLAG_RETRY	0x04	/* Retry majoy fault */

 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
+#define VM_FAULT_RETRY	0x0010

 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..aab4a08 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,56 @@ repeat:
 EXPORT_SYMBOL(find_lock_page);

 /**
+ * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
+ * flag is on, and page is already locked by someone else, return a hint of
+ * retry.
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @page: zero if page not present, otherwise point to the page in
+ * pagecache.
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * Return *page==NULL if page is not in pagecache. Otherwise return *page
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+				struct vm_area_struct *vma, struct page **page,
+				int retry)
+{
+	unsigned int ret = 0;
+
+repeat:
+	*page = find_get_page(mapping, offset);
+	if (*page) {
+		if (!retry)
+			lock_page(*page);
+		else {
+			if (!trylock_page(*page)) {
+				struct mm_struct *mm = vma->vm_mm;
+
+				up_read(&mm->mmap_sem);
+				wait_on_page_locked(*page);
+				down_read(&mm->mmap_sem);
+
+				page_cache_release(*page);
+				return VM_FAULT_RETRY;
+			}
+		}
+		if (unlikely((*page)->mapping != mapping)) {
+			unlock_page(*page);
+			page_cache_release(*page);
+			goto repeat;
+		}
+		VM_BUG_ON((*page)->index != offset);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -1444,6 +1494,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	pgoff_t size;
 	int did_readaround = 0;
 	int ret = 0;
+	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+	int retry_ret;

 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1458,6 +1510,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	 */
 retry_find:
 	page = find_lock_page(mapping, vmf->pgoff);
+
+retry_find_nopage:
 	/*
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
@@ -1465,9 +1519,12 @@ retry_find:
 		if (!page) {
 			page_cache_sync_readahead(mapping, ra, file,
 							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
+			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+						vma, &page, retry_flag);
 			if (!page)
 				goto no_cached_page;
+			if (retry_ret == VM_FAULT_RETRY)
+				return retry_ret;
 		}
 		if (PageReadahead(page)) {
 			page_cache_async_readahead(mapping, ra, file, page,
@@ -1504,14 +1561,18 @@ retry_find:
 				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+				vma, &page, retry_flag);
 		if (!page)
 			goto no_cached_page;
+		if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
 	}

 	if (!did_readaround)
 		ra->mmap_miss--;

+retry_page_update:
 	/*
 	 * We have a locked page in the page cache, now we need to check
 	 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1547,8 +1608,23 @@ no_cached_page:
 	 * In the unlikely event that someone removed it in the
 	 * meantime, we'll just come back here and read it again.
 	 */
-	if (error >= 0)
-		goto retry_find;
+	if (error >= 0) {
+		/*
+		 * If caller cannot tolerate a retry in the ->fault path
+		 * go back to check the page again.
+		 */
+		if (!retry_flag)
+			goto retry_find;
+
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+					vma, &page, retry_flag);
+		if (!page)
+			goto retry_find_nopage;
+		else if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
+		else
+			goto retry_page_update;
+	}

 	/*
 	 * An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..1ff37f7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
 	vmf.page = NULL;

 	ret = vma->vm_ops->fault(vma, &vmf);
+
+	/* page may be available, but we have to restart the process
+	 * because mmap_sem was dropped during the ->fault
+	 */
+	if (ret == VM_FAULT_RETRY)
+		return ret;
+
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;

@@ -2613,6 +2620,7 @@ static int do_linear_fault(struct mm_struct *mm, struct
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);

+	flags |= (write_access & FAULT_FLAG_RETRY);
 	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-05 19:40 [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY Ying Han
@ 2008-12-06  9:52 ` Török Edwin
  2008-12-06  9:55   ` Török Edwin
  2008-12-09 19:31 ` Andrew Morton
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Török Edwin @ 2008-12-06  9:52 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, akpm, Ingo Molnar, Mike Waychison,
	David Rientjes, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Lee Schermerhorn, Nick Piggin

On 2008-12-05 21:40, Ying Han wrote:
> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
>   handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
>   cleaner.
>
> todo:
> - there is potential a starvation hole with the retry. By the time the
>   retry returns, the pages might be released. we can make change by holding
>   page reference as well as remembering what the page "was"(in case the
>   file was truncated). any suggestion here are welcomed.
>
> I also made patches for all other arch. I am posting x86_64 here first and
> i will post others by the time everyone feels comfortable of this patch.
>
> Edwin, please test this patch with your testcase and check if you get any
> performance improvement of mmap over read. I added another two more places
> in filemap_fault with retry logic which you might hit in your privous
> experiment.
>   

I get much better results with this patch than with v1, thanks!

mmap now scales almost as well as read does (there is a small ~5%
overhead), which is a significant improvement over not scaling at all!

Here are the results when running my testcase:

Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
read, mmap, mixed, read, mmap, mixed
2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3


Here are the /proc/lock_stat output when running my testcase, contention
is lower (34911+10462 vs 58590+7231), and waittime-total is better
(57 601 464 vs 234 170 024)

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions  
waittime-min   waittime-max waittime-total    acq-bounces  
acquisitions   holdtime-min   holdtime-max holdtime-total
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                        &mm->mmap_sem-W:          5843         
10462           2.89      138824.72    14217159.52         
18965          84205           1.81        5031.07      725293.65
                         &mm->mmap_sem-R:         20208         
34911           4.87      136797.26    57601464.49          55797       
1110394           1.89      164918.52    30551371.71
                         ---------------
                           &mm->mmap_sem           5341         
[<ffffffff802bf9d7>] sys_munmap+0x47/0x80
                           &mm->mmap_sem          28579         
[<ffffffff805d1c62>] do_page_fault+0x172/0xab0
                           &mm->mmap_sem           5030         
[<ffffffff80211161>] sys_mmap+0xf1/0x140
                           &mm->mmap_sem           6331         
[<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
                         ---------------
                           &mm->mmap_sem          13558         
[<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
                           &mm->mmap_sem           4694         
[<ffffffff802bf9d7>] sys_munmap+0x47/0x80
                           &mm->mmap_sem           3681         
[<ffffffff80211161>] sys_mmap+0xf1/0x140
                           &mm->mmap_sem          23374         
[<ffffffff805d1c62>] do_page_fault+0x172/0xab0


On clamd:

Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
514 673), and number of contentions on read
(458 052 vs 5851


lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions  
waittime-min   waittime-max waittime-total    acq-bounces  
acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                         &mm->mmap_sem-W:        346769        
533541           1.62       99819.40   454843342.63        
411259         588486           1.33        6719.62     2395987.75
                         &mm->mmap_sem-R:        197856        
458052           1.59       99800.28   313508721.01        
338158         653427           1.71       25421.10     1493154.95
                         ---------------
                           &mm->mmap_sem         427857         
[<ffffffff805d1c62>] do_page_fault+0x172/0xab0
                           &mm->mmap_sem         266464         
[<ffffffff802bf9d7>] sys_munmap+0x47/0x80
                           &mm->mmap_sem         251689         
[<ffffffff802110d6>] sys_mmap+0x66/0x140
                           &mm->mmap_sem          15187         
[<ffffffff80211161>] sys_mmap+0xf1/0x140
                         ---------------
                           &mm->mmap_sem         226908         
[<ffffffff802110d6>] sys_mmap+0x66/0x140
                           &mm->mmap_sem         483909         
[<ffffffff805d1c62>] do_page_fault+0x172/0xab0
                           &mm->mmap_sem         229404         
[<ffffffff802bf9d7>] sys_munmap+0x47/0x80
                           &mm->mmap_sem          13229         
[<ffffffff80211161>] sys_mmap+0xf1/0x140

...............................................................................................................................................................................................

                         &sem->wait_lock:        112617        
114394           0.41         111.20      225590.14       
1517470        6300681           0.27        4103.77     3814684.55
                         ---------------
                         &sem->wait_lock           5634         
[<ffffffff8043a608>] __up_write+0x28/0x170
                         &sem->wait_lock          13595         
[<ffffffff805ce4dc>] __down_read+0x1c/0xbc
                         &sem->wait_lock          38882         
[<ffffffff8043a4a0>] __down_read_trylock+0x20/0x60
                         &sem->wait_lock          30718         
[<ffffffff8043a773>] __up_read+0x23/0xc0
                         ---------------
                         &sem->wait_lock          21389         
[<ffffffff8043a4a0>] __down_read_trylock+0x20/0x60
                         &sem->wait_lock          48959         
[<ffffffff8043a608>] __up_write+0x28/0x170
                         &sem->wait_lock          24330         
[<ffffffff8043a773>] __up_read+0x23/0xc0
                         &sem->wait_lock           9000         
[<ffffffff805ce4dc>] __down_read+0x1c/0xbc


> @@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_SIGBUS	0x0002
>  #define VM_FAULT_MAJOR	0x0004
>  #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
> +#define VM_FAULT_RETRY	0x0010
>
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page
>   

The patch got damaged here, and failed to apply, I added the missing */,
and then git-am -3 applied it.

Best regards,
--Edwin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-06  9:52 ` Török Edwin
@ 2008-12-06  9:55   ` Török Edwin
  2008-12-08  1:43     ` Ying Han
  0 siblings, 1 reply; 16+ messages in thread
From: Török Edwin @ 2008-12-06  9:55 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, akpm, Ingo Molnar, Mike Waychison,
	David Rientjes, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Lee Schermerhorn, Nick Piggin

On 2008-12-06 11:52, Torok Edwin wrote:
> On 2008-12-05 21:40, Ying Han wrote:
>   
>> changelog[v2]:
>> - reduce the runtime overhead by extending the 'write' flag of
>>   handle_mm_fault() to indicate the retry hint.
>> - add another two branches in filemap_fault with retry logic.
>> - replace find_lock_page with find_lock_page_retry to make the code
>>   cleaner.
>>
>> todo:
>> - there is potential a starvation hole with the retry. By the time the
>>   retry returns, the pages might be released. we can make change by holding
>>   page reference as well as remembering what the page "was"(in case the
>>   file was truncated). any suggestion here are welcomed.
>>
>> I also made patches for all other arch. I am posting x86_64 here first and
>> i will post others by the time everyone feels comfortable of this patch.
>>
>> Edwin, please test this patch with your testcase and check if you get any
>> performance improvement of mmap over read. I added another two more places
>> in filemap_fault with retry logic which you might hit in your privous
>> experiment.
>>   
>>     
>
> I get much better results with this patch than with v1, thanks!
>
> mmap now scales almost as well as read does (there is a small ~5%
> overhead), which is a significant improvement over not scaling at all!
>
> Here are the results when running my testcase:
>
> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
> read, mmap, mixed, read, mmap, mixed
> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>
>
> Here are the /proc/lock_stat output when running my testcase, contention
> is lower (34911+10462 vs 58590+7231), and waittime-total is better
> (57 601 464 vs 234 170 024)
>
> lock_stat version 0.3
> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>                               class name    con-bounces    contentions  
> waittime-min   waittime-max waittime-total    acq-bounces  
> acquisitions   holdtime-min   holdtime-max holdtime-total
> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>                         &mm->mmap_sem-W:          5843         
> 10462           2.89      138824.72    14217159.52         
> 18965          84205           1.81        5031.07      725293.65
>                          &mm->mmap_sem-R:         20208         
> 34911           4.87      136797.26    57601464.49          55797       
> 1110394           1.89      164918.52    30551371.71
>                          ---------------
>                            &mm->mmap_sem           5341         
> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>                            &mm->mmap_sem          28579         
> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>                            &mm->mmap_sem           5030         
> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>                            &mm->mmap_sem           6331         
> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>                          ---------------
>                            &mm->mmap_sem          13558         
> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>                            &mm->mmap_sem           4694         
> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>                            &mm->mmap_sem           3681         
> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>                            &mm->mmap_sem          23374         
> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>
>
> On clamd:
>
> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
> 514 673), and number of contentions on read
> (458 052 vs 5851

typo, should have been: 458 052 vs 585 119

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-06  9:55   ` Török Edwin
@ 2008-12-08  1:43     ` Ying Han
  2008-12-09 17:57       ` Ying Han
  0 siblings, 1 reply; 16+ messages in thread
From: Ying Han @ 2008-12-08  1:43 UTC (permalink / raw)
  To: Török Edwin
  Cc: linux-mm, linux-kernel, akpm, Ingo Molnar, Mike Waychison,
	David Rientjes, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Lee Schermerhorn, Nick Piggin

Thanks Torok for your experiment and that sounds great !

--Ying

On Sat, Dec 6, 2008 at 1:55 AM, Torok Edwin <edwintorok@gmail.com> wrote:
> On 2008-12-06 11:52, Torok Edwin wrote:
>> On 2008-12-05 21:40, Ying Han wrote:
>>
>>> changelog[v2]:
>>> - reduce the runtime overhead by extending the 'write' flag of
>>>   handle_mm_fault() to indicate the retry hint.
>>> - add another two branches in filemap_fault with retry logic.
>>> - replace find_lock_page with find_lock_page_retry to make the code
>>>   cleaner.
>>>
>>> todo:
>>> - there is potential a starvation hole with the retry. By the time the
>>>   retry returns, the pages might be released. we can make change by holding
>>>   page reference as well as remembering what the page "was"(in case the
>>>   file was truncated). any suggestion here are welcomed.
>>>
>>> I also made patches for all other arch. I am posting x86_64 here first and
>>> i will post others by the time everyone feels comfortable of this patch.
>>>
>>> Edwin, please test this patch with your testcase and check if you get any
>>> performance improvement of mmap over read. I added another two more places
>>> in filemap_fault with retry logic which you might hit in your privous
>>> experiment.
>>>
>>>
>>
>> I get much better results with this patch than with v1, thanks!
>>
>> mmap now scales almost as well as read does (there is a small ~5%
>> overhead), which is a significant improvement over not scaling at all!
>>
>> Here are the results when running my testcase:
>>
>> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
>> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
>> read, mmap, mixed, read, mmap, mixed
>> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
>> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>>
>>
>> Here are the /proc/lock_stat output when running my testcase, contention
>> is lower (34911+10462 vs 58590+7231), and waittime-total is better
>> (57 601 464 vs 234 170 024)
>>
>> lock_stat version 0.3
>> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>                               class name    con-bounces    contentions
>> waittime-min   waittime-max waittime-total    acq-bounces
>> acquisitions   holdtime-min   holdtime-max holdtime-total
>> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>                         &mm->mmap_sem-W:          5843
>> 10462           2.89      138824.72    14217159.52
>> 18965          84205           1.81        5031.07      725293.65
>>                          &mm->mmap_sem-R:         20208
>> 34911           4.87      136797.26    57601464.49          55797
>> 1110394           1.89      164918.52    30551371.71
>>                          ---------------
>>                            &mm->mmap_sem           5341
>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>                            &mm->mmap_sem          28579
>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>                            &mm->mmap_sem           5030
>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>                            &mm->mmap_sem           6331
>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>                          ---------------
>>                            &mm->mmap_sem          13558
>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>                            &mm->mmap_sem           4694
>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>                            &mm->mmap_sem           3681
>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>                            &mm->mmap_sem          23374
>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>
>>
>> On clamd:
>>
>> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
>> 514 673), and number of contentions on read
>> (458 052 vs 5851
>
> typo, should have been: 458 052 vs 585 119
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-08  1:43     ` Ying Han
@ 2008-12-09 17:57       ` Ying Han
  0 siblings, 0 replies; 16+ messages in thread
From: Ying Han @ 2008-12-09 17:57 UTC (permalink / raw)
  To: Török Edwin
  Cc: linux-mm, linux-kernel, akpm, Ingo Molnar, Mike Waychison,
	David Rientjes, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Lee Schermerhorn, Nick Piggin

Andrew, if it sounds good to include this patch in -mm at this point?

thanks

--Ying

On Sun, Dec 7, 2008 at 5:43 PM, Ying Han <yinghan@google.com> wrote:
> Thanks Torok for your experiment and that sounds great !
>
> --Ying
>
> On Sat, Dec 6, 2008 at 1:55 AM, Torok Edwin <edwintorok@gmail.com> wrote:
>> On 2008-12-06 11:52, Torok Edwin wrote:
>>> On 2008-12-05 21:40, Ying Han wrote:
>>>
>>>> changelog[v2]:
>>>> - reduce the runtime overhead by extending the 'write' flag of
>>>>   handle_mm_fault() to indicate the retry hint.
>>>> - add another two branches in filemap_fault with retry logic.
>>>> - replace find_lock_page with find_lock_page_retry to make the code
>>>>   cleaner.
>>>>
>>>> todo:
>>>> - there is potential a starvation hole with the retry. By the time the
>>>>   retry returns, the pages might be released. we can make change by holding
>>>>   page reference as well as remembering what the page "was"(in case the
>>>>   file was truncated). any suggestion here are welcomed.
>>>>
>>>> I also made patches for all other arch. I am posting x86_64 here first and
>>>> i will post others by the time everyone feels comfortable of this patch.
>>>>
>>>> Edwin, please test this patch with your testcase and check if you get any
>>>> performance improvement of mmap over read. I added another two more places
>>>> in filemap_fault with retry logic which you might hit in your privous
>>>> experiment.
>>>>
>>>>
>>>
>>> I get much better results with this patch than with v1, thanks!
>>>
>>> mmap now scales almost as well as read does (there is a small ~5%
>>> overhead), which is a significant improvement over not scaling at all!
>>>
>>> Here are the results when running my testcase:
>>>
>>> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
>>> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
>>> read, mmap, mixed, read, mmap, mixed
>>> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
>>> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>>>
>>>
>>> Here are the /proc/lock_stat output when running my testcase, contention
>>> is lower (34911+10462 vs 58590+7231), and waittime-total is better
>>> (57 601 464 vs 234 170 024)
>>>
>>> lock_stat version 0.3
>>> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>>                               class name    con-bounces    contentions
>>> waittime-min   waittime-max waittime-total    acq-bounces
>>> acquisitions   holdtime-min   holdtime-max holdtime-total
>>> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>>                         &mm->mmap_sem-W:          5843
>>> 10462           2.89      138824.72    14217159.52
>>> 18965          84205           1.81        5031.07      725293.65
>>>                          &mm->mmap_sem-R:         20208
>>> 34911           4.87      136797.26    57601464.49          55797
>>> 1110394           1.89      164918.52    30551371.71
>>>                          ---------------
>>>                            &mm->mmap_sem           5341
>>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>>                            &mm->mmap_sem          28579
>>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>>                            &mm->mmap_sem           5030
>>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>>                            &mm->mmap_sem           6331
>>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>>                          ---------------
>>>                            &mm->mmap_sem          13558
>>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>>                            &mm->mmap_sem           4694
>>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>>                            &mm->mmap_sem           3681
>>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>>                            &mm->mmap_sem          23374
>>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>>
>>>
>>> On clamd:
>>>
>>> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
>>> 514 673), and number of contentions on read
>>> (458 052 vs 5851
>>
>> typo, should have been: 458 052 vs 585 119
>>
>>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-05 19:40 [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY Ying Han
  2008-12-06  9:52 ` Török Edwin
@ 2008-12-09 19:31 ` Andrew Morton
  2009-01-26 19:37 ` Andrew Morton
  2009-03-31 22:00 ` Andrew Morton
  3 siblings, 0 replies; 16+ messages in thread
From: Andrew Morton @ 2008-12-09 19:31 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, mingo, mikew, rientjes, rohitseth, hugh,
	a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <yinghan@google.com> wrote:

> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
>   handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
>   cleaner.
> 
> todo:
> - there is potential a starvation hole with the retry. By the time the
>   retry returns, the pages might be released. we can make change by holding
>   page reference as well as remembering what the page "was"(in case the
>   file was truncated). any suggestion here are welcomed.
> 
> I also made patches for all other arch. I am posting x86_64 here first and
> i will post others by the time everyone feels comfortable of this patch.
> 
> Edwin, please test this patch with your testcase and check if you get any
> performance improvement of mmap over read. I added another two more places
> in filemap_fault with retry logic which you might hit in your privous
> experiment.
> 
> 
> page fault retry with NOPAGE_RETRY
> Allow major faults to drop the mmap_sem read lock while waiting for
> synchronous disk read. This allows another thread which wishes to grab
> down_write(mmap_sem) to proceed while the current is waiting the disk IO.
> 
> The patch extend the 'write' flag of handle_mm_fault() to FAULT_FLAG_RETRY
> as identify that the caller can tolerate the retry in the filemap_fault call
> patch.
> 
> This patch helps a lot in cases we have writer which is waitting behind all
> readers, so it could execute much faster.
> 
> 
>  Signed-off-by: Mike Waychison <mikew@google.com>
>  Signed-off-by: Ying Han <yinghan@google.com>
> 

It would be useful if the changelog were to describe the performance
benefits of the patch.  I mean, the whole point of the patch is to
improve throughpuyt/latency/etc, but we see no evidence here that it
_does_ this?

> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 31e8730..5cf5eff 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigne
>  #ifdef CONFIG_X86_64
>  	unsigned long flags;
>  #endif
> +	unsigned int retry_flag = FAULT_FLAG_RETRY;
> 
>  	tsk = current;
>  	mm = tsk->mm;
> @@ -689,6 +690,7 @@ again:
>  		down_read(&mm->mmap_sem);
>  	}
> 
> +retry:
>  	vma = find_vma(mm, address);
>  	if (!vma)
>  		goto bad_area;
> @@ -715,6 +717,7 @@ again:
>  good_area:
>  	si_code = SEGV_ACCERR;
>  	write = 0;
> +	write |= retry_flag;
>  	switch (error_code & (PF_PROT|PF_WRITE)) {
>  	default:	/* 3: write, present */
>  		/* fall through */
> @@ -743,6 +746,15 @@ good_area:
>  			goto do_sigbus;
>  		BUG();
>  	}
> +
> +	if (fault & VM_FAULT_RETRY) {
> +		if (write & FAULT_FLAG_RETRY) {
> +			retry_flag &= ~FAULT_FLAG_RETRY;

What are we doing here?  We appear to be retrying the fault once, then
we switch to synchronous mode?  There is no description of this in the
changelog and there is no comment explaining the reasons for this
design.  There is no way in which readers of this code will be able to
understand why it is implemented in this fashion.

And a *lot* of people will want to know why this was done this way. 
Starting with about twenty arch maintainers!

Please add a comment which completely describes this code section.


> +			goto retry;
> +		}
> +		BUG();
> +	}
> +
>  	if (fault & VM_FAULT_MAJOR)
>  		tsk->maj_flt++;
>  	else
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..9cc65a5 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,7 +144,7 @@ extern pgprot_t protection_map[16];
> 
>  #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
>  #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
> -
> +#define FAULT_FLAG_RETRY	0x04	/* Retry majoy fault */
> 
>  /*
>   * vm_fault is filled by the the pagefault handler and passed to the vma's
> @@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_SIGBUS	0x0002
>  #define VM_FAULT_MAJOR	0x0004
>  #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
> +#define VM_FAULT_RETRY	0x0010
> 
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..aab4a08 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,56 @@ repeat:
>  EXPORT_SYMBOL(find_lock_page);
> 
>  /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
> + * flag is on, and page is already locked by someone else, return a hint of
> + * retry.

kerneldoc does not support linewrapping in this context.  It has to be
done on a single line.  Please see the appended patch.

> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @page: zero if page not present, otherwise point to the page in
> + * pagecache.

Ditto here.

> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * Return *page==NULL if page is not in pagecache. Otherwise return *page
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> +				struct vm_area_struct *vma, struct page **page,
> +				int retry)
> +{
> +	unsigned int ret = 0;
> +
> +repeat:
> +	*page = find_get_page(mapping, offset);
> +	if (*page) {
> +		if (!retry)
> +			lock_page(*page);
> +		else {
> +			if (!trylock_page(*page)) {
> +				struct mm_struct *mm = vma->vm_mm;
> +
> +				up_read(&mm->mmap_sem);
> +				wait_on_page_locked(*page);
> +				down_read(&mm->mmap_sem);
> +
> +				page_cache_release(*page);
> +				return VM_FAULT_RETRY;
> +			}
> +		}
> +		if (unlikely((*page)->mapping != mapping)) {
> +			unlock_page(*page);
> +			page_cache_release(*page);
> +			goto repeat;
> +		}
> +		VM_BUG_ON((*page)->index != offset);
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);

The patch adds no declaration for find_lock_page_retry() in any header
file.  See appended patch.

The frequent dereferencing of `page' is a bit ungainly, and adds risk
that gcc will generate poor code (my version of gcc does manage to do
the right thing, however).  I do think the code woukld look better if
we added a local for this.  See appended patch.

MM developers expect a variable called `page' to have type `struct page
*'.  This function violates that by adding `struct page **page'.  See
appended patch.


> +/**
>   * find_or_create_page - locate or add a pagecache page
>   * @mapping: the page's address_space
>   * @index: the page's index into the mapping
> @@ -1444,6 +1494,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	pgoff_t size;
>  	int did_readaround = 0;
>  	int ret = 0;
> +	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> +	int retry_ret;
> 
>  	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -1458,6 +1510,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	 */
>  retry_find:
>  	page = find_lock_page(mapping, vmf->pgoff);
> +
> +retry_find_nopage:
>  	/*
>  	 * For sequential accesses, we use the generic readahead logic.
>  	 */
> @@ -1465,9 +1519,12 @@ retry_find:
>  		if (!page) {
>  			page_cache_sync_readahead(mapping, ra, file,
>  							   vmf->pgoff, 1);
> -			page = find_lock_page(mapping, vmf->pgoff);
> +			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +						vma, &page, retry_flag);
>  			if (!page)
>  				goto no_cached_page;
> +			if (retry_ret == VM_FAULT_RETRY)
> +				return retry_ret;
>  		}
>  		if (PageReadahead(page)) {
>  			page_cache_async_readahead(mapping, ra, file, page,
> @@ -1504,14 +1561,18 @@ retry_find:
>  				start = vmf->pgoff - ra_pages / 2;
>  			do_page_cache_readahead(mapping, file, start, ra_pages);
>  		}
> -		page = find_lock_page(mapping, vmf->pgoff);
> +		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +				vma, &page, retry_flag);
>  		if (!page)
>  			goto no_cached_page;
> +		if (retry_ret == VM_FAULT_RETRY)
> +			return retry_ret;
>  	}
> 
>  	if (!did_readaround)
>  		ra->mmap_miss--;
> 
> +retry_page_update:
>  	/*
>  	 * We have a locked page in the page cache, now we need to check
>  	 * that it's up-to-date. If not, it is going to be due to an error.
> @@ -1547,8 +1608,23 @@ no_cached_page:
>  	 * In the unlikely event that someone removed it in the
>  	 * meantime, we'll just come back here and read it again.
>  	 */
> -	if (error >= 0)
> -		goto retry_find;
> +	if (error >= 0) {
> +		/*
> +		 * If caller cannot tolerate a retry in the ->fault path
> +		 * go back to check the page again.
> +		 */
> +		if (!retry_flag)
> +			goto retry_find;
> +
> +		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +					vma, &page, retry_flag);
> +		if (!page)
> +			goto retry_find_nopage;
> +		else if (retry_ret == VM_FAULT_RETRY)
> +			return retry_ret;
> +		else
> +			goto retry_page_update;
> +	}
> 
>  	/*
>  	 * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..1ff37f7 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>  	vmf.page = NULL;
> 
>  	ret = vma->vm_ops->fault(vma, &vmf);
> +
> +	/* page may be available, but we have to restart the process
> +	 * because mmap_sem was dropped during the ->fault
> +	 */
> +	if (ret == VM_FAULT_RETRY)
> +		return ret;

Shouldn't this be

	if (ret & VM_FAULT_RETRY)

?

It may not make any difference in practice with present ->fault
implementations, but that's through sheer luck.

>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>  		return ret;
> 
> @@ -2613,6 +2620,7 @@ static int do_linear_fault(struct mm_struct *mm, struct
>  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>  	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> 
> +	flags |= (write_access & FAULT_FLAG_RETRY);
>  	pte_unmap(page_table);
>  	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>  }


 include/linux/pagemap.h |    3 +++
 mm/filemap.c            |   38 ++++++++++++++++++++------------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix arch/x86/mm/fault.c
diff -puN include/linux/mm.h~page_fault-retry-with-nopage_retry-fix include/linux/mm.h
diff -puN mm/filemap.c~page_fault-retry-with-nopage_retry-fix mm/filemap.c
--- a/mm/filemap.c~page_fault-retry-with-nopage_retry-fix
+++ a/mm/filemap.c
@@ -714,51 +714,53 @@ repeat:
 EXPORT_SYMBOL(find_lock_page);
 
 /**
- * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
- * flag is on, and page is already locked by someone else, return a hint of
- * retry.
+ * find_lock_page_retry - locate, pin and lock a pagecache page
  * @mapping: the address_space to search
  * @offset: the page index
  * @vma: vma in which the fault was taken
- * @page: zero if page not present, otherwise point to the page in
- * pagecache.
+ * @ppage: zero if page not present, otherwise point to the page in pagecache.
  * @retry: 1 indicate caller tolerate a retry.
  *
- * Return *page==NULL if page is not in pagecache. Otherwise return *page
+ * If retry flag is on, and page is already locked by someone else, return a
+ * hint of retry.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
  * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
- * hint to caller for retry, or ret=0 which means page is succefully
+ * hint to caller for retry, or ret=0 which means page is successfully
  * locked.
  */
 unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
-				struct vm_area_struct *vma, struct page **page,
+				struct vm_area_struct *vma, struct page **ppage,
 				int retry)
 {
 	unsigned int ret = 0;
+	struct page *page;
 
 repeat:
-	*page = find_get_page(mapping, offset);
-	if (*page) {
+	page = find_get_page(mapping, offset);
+	if (page) {
 		if (!retry)
-			lock_page(*page);
+			lock_page(page);
 		else {
-			if (!trylock_page(*page)) {
+			if (!trylock_page(page)) {
 				struct mm_struct *mm = vma->vm_mm;
 
 				up_read(&mm->mmap_sem);
-				wait_on_page_locked(*page);
+				wait_on_page_locked(page);
 				down_read(&mm->mmap_sem);
 
-				page_cache_release(*page);
+				page_cache_release(page);
 				return VM_FAULT_RETRY;
 			}
 		}
-		if (unlikely((*page)->mapping != mapping)) {
-			unlock_page(*page);
-			page_cache_release(*page);
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
 			goto repeat;
 		}
-		VM_BUG_ON((*page)->index != offset);
+		VM_BUG_ON(page->index != offset);
 	}
+	*ppage = page;
 	return ret;
 }
 EXPORT_SYMBOL(find_lock_page_retry);
diff -puN mm/memory.c~page_fault-retry-with-nopage_retry-fix mm/memory.c
diff -puN include/linux/pagemap.h~page_fault-retry-with-nopage_retry-fix include/linux/pagemap.h
--- a/include/linux/pagemap.h~page_fault-retry-with-nopage_retry-fix
+++ a/include/linux/pagemap.h
@@ -232,6 +232,9 @@ extern struct page * find_get_page(struc
 				pgoff_t index);
 extern struct page * find_lock_page(struct address_space *mapping,
 				pgoff_t index);
+extern unsigned find_lock_page_retry(struct address_space *mapping,
+			pgoff_t offset, struct vm_area_struct *vma,
+			struct page **ppage, int retry)
 extern struct page * find_or_create_page(struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-05 19:40 [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY Ying Han
  2008-12-06  9:52 ` Török Edwin
  2008-12-09 19:31 ` Andrew Morton
@ 2009-01-26 19:37 ` Andrew Morton
       [not found]   ` <604427e00901261508n7967ea74m3deacd3213c86065@mail.gmail.com>
  2009-03-31 22:00 ` Andrew Morton
  3 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2009-01-26 19:37 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, mingo, mikew, rientjes, rohitseth, hugh,
	a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <yinghan@google.com> wrote:

> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigne
>  #ifdef CONFIG_X86_64
>  	unsigned long flags;
>  #endif
> +	unsigned int retry_flag = FAULT_FLAG_RETRY;
> 
>  	tsk = current;
>  	mm = tsk->mm;
> @@ -689,6 +690,7 @@ again:
>  		down_read(&mm->mmap_sem);
>  	}
> 
> +retry:
>  	vma = find_vma(mm, address);
>  	if (!vma)
>  		goto bad_area;
> @@ -715,6 +717,7 @@ again:
>  good_area:
>  	si_code = SEGV_ACCERR;
>  	write = 0;
> +	write |= retry_flag;
>  	switch (error_code & (PF_PROT|PF_WRITE)) {
>  	default:	/* 3: write, present */
>  		/* fall through */
> @@ -743,6 +746,15 @@ good_area:
>  			goto do_sigbus;
>  		BUG();
>  	}
> +
> +	if (fault & VM_FAULT_RETRY) {
> +		if (write & FAULT_FLAG_RETRY) {
> +			retry_flag &= ~FAULT_FLAG_RETRY;
> +			goto retry;
> +		}
> +		BUG();
> +	}
> +
>  	if (fault & VM_FAULT_MAJOR)
>  		tsk->maj_flt++;
>  	else

This code is mixing flags from the FAULT_FLAG_foor domain into local
variable `write'.  But that's inappropriate because `write' is a
boolean, and in one of Ingo's trees, `write' gets bits other than bit 0
set, and it all generally ends up a mess.

Can we not do that?  I assume that a previous version of this patch
kept those things separated?

Something like this, I think?

diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix arch/x86/mm/fault.c
--- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
+++ a/arch/x86/mm/fault.c
@@ -799,7 +799,7 @@ void __kprobes do_page_fault(struct pt_r
 	struct vm_area_struct *vma;
 	int write;
 	int fault;
-	unsigned int retry_flag = FAULT_FLAG_RETRY;
+	int retry_flag = 1;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -951,6 +951,7 @@ good_area:
 	}
 
 	write |= retry_flag;
+
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -969,8 +970,8 @@ good_area:
 	 * be removed or changed after the retry.
 	 */
 	if (fault & VM_FAULT_RETRY) {
-		if (write & FAULT_FLAG_RETRY) {
-			retry_flag &= ~FAULT_FLAG_RETRY;
+		if (retry_flag) {
+			retry_flag = 0;
 			goto retry;
 		}
 		BUG();


Question: why is this code passing `write==true' into handle_mm_fault()
in the retry case?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
       [not found]   ` <604427e00901261508n7967ea74m3deacd3213c86065@mail.gmail.com>
@ 2009-01-26 23:52     ` Andrew Morton
  2009-01-26 23:57       ` Ingo Molnar
  0 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2009-01-26 23:52 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, mingo, mikew, rientjes, rohitseth, hugh,
	a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin

On Mon, 26 Jan 2009 15:08:48 -0800
Ying Han <yinghan@google.com> wrote:

> On Mon, Jan 26, 2009 at 11:37 AM, Andrew Morton
> <akpm@linux-foundation.org>wrote:
> 
> > On Fri, 5 Dec 2008 11:40:19 -0800
> > Ying Han <yinghan@google.com> wrote:
> >
> > > --- a/arch/x86/mm/fault.c
> > > +++ b/arch/x86/mm/fault.c
> > > @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs,
> > unsigne
> > >  #ifdef CONFIG_X86_64
> > >       unsigned long flags;
> > >  #endif
> > > +     unsigned int retry_flag = FAULT_FLAG_RETRY;
> > >
> > >       tsk = current;
> > >       mm = tsk->mm;
> > > @@ -689,6 +690,7 @@ again:
> > >               down_read(&mm->mmap_sem);
> > >       }
> > >
> > > +retry:
> > >       vma = find_vma(mm, address);
> > >       if (!vma)
> > >               goto bad_area;
> > > @@ -715,6 +717,7 @@ again:
> > >  good_area:
> > >       si_code = SEGV_ACCERR;
> > >       write = 0;
> > > +     write |= retry_flag;
> > >       switch (error_code & (PF_PROT|PF_WRITE)) {
> > >       default:        /* 3: write, present */
> > >               /* fall through */
> > > @@ -743,6 +746,15 @@ good_area:
> > >                       goto do_sigbus;
> > >               BUG();
> > >       }
> > > +
> > > +     if (fault & VM_FAULT_RETRY) {
> > > +             if (write & FAULT_FLAG_RETRY) {
> > > +                     retry_flag &= ~FAULT_FLAG_RETRY;
> > > +                     goto retry;
> > > +             }
> > > +             BUG();
> > > +     }
> > > +
> > >       if (fault & VM_FAULT_MAJOR)
> > >               tsk->maj_flt++;
> > >       else
> >
> > This code is mixing flags from the FAULT_FLAG_foor domain into local
> > variable `write'.  But that's inappropriate because `write' is a
> > boolean, and in one of Ingo's trees, `write' gets bits other than bit 0
> > set, and it all generally ends up a mess.
> >
> > Can we not do that?  I assume that a previous version of this patch
> > kept those things separated?
> >
> > Something like this, I think?
> >
> > diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
> > arch/x86/mm/fault.c
> > --- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
> > +++ a/arch/x86/mm/fault.c
> > @@ -799,7 +799,7 @@ void __kprobes do_page_fault(struct pt_r
> >        struct vm_area_struct *vma;
> >        int write;
> >        int fault;
> > -       unsigned int retry_flag = FAULT_FLAG_RETRY;
> > +       int retry_flag = 1;
> >
> >        tsk = current;
> >        mm = tsk->mm;
> > @@ -951,6 +951,7 @@ good_area:
> >        }
> >
> >        write |= retry_flag;
> > +
> >        /*
> >         * If for any reason at all we couldn't handle the fault,
> >         * make sure we exit gracefully rather than endlessly redo
> > @@ -969,8 +970,8 @@ good_area:
> >         * be removed or changed after the retry.
> >         */
> >         if (fault & VM_FAULT_RETRY) {
> > -               if (write & FAULT_FLAG_RETRY) {
> > -                       retry_flag &= ~FAULT_FLAG_RETRY;
> > +               if (retry_flag) {
> > +                       retry_flag = 0;
> >                        goto retry;
> >                }
> >                BUG();
> 
> with this change, 'write' still gets bits other than bit 0
> set in the case of 'write, not present' and the Ingo's problem remains, am i
> missing something here?

umm, yes.  This?

--- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix-fix
+++ a/arch/x86/mm/fault.c
@@ -950,8 +950,6 @@ good_area:
 		return;
 	}
 
-	write |= retry_flag;
-
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
_


(I should just give up here - doing too many things at once)

> >
> >
> >
> > Question: why is this code passing `write==true' into handle_mm_fault()
> > in the retry case?
> > Here i am using unused bit of "write" to carry FAULT_FLAG_RETRY flag down
> > to the handle_mm_fault(). Meanwhile, "write" still have its read/write bit
> > set as it is before. It is true that 'write == true' in the retry patch, but
> > i did the correct interpretation in
> 
> 
> 
> > static int do_linear_fault() {
> >
>          int write = write_access & ~FAULT_FLAG_RETRY;
>          unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> 
>          flags |= (write_access & FAULT_FLAG_RETRY);
>          pte_unmap(page_table);
>          return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }


OK, this is horridly confusing.  Is `write_access' a boolean, as its
name implies, or is it a bunch of flags?

If we're going to turn it into a bunch of flags then it should be
renamed!  And callsites such as do_page_fault() should rename their
local variable `write' to something which accurately conveys the new
usage.  And various code comments in mm/memory.c (which don't appear to
exist) should be updated.

I think that a good way to present this is as a preparatory patch:
"convert the fourth argument to handle_mm_fault() from a boolean to a
flags word".  That would be a simple do-nothing patch which affects all
architectures and which ideally would break the build at any
unconverted code sites.  (Change the argument order?)

What do you think?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2009-01-26 23:52     ` Andrew Morton
@ 2009-01-26 23:57       ` Ingo Molnar
  2009-01-27  4:34         ` KOSAKI Motohiro
  0 siblings, 1 reply; 16+ messages in thread
From: Ingo Molnar @ 2009-01-26 23:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, mikew, rientjes, rohitseth,
	hugh, a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin


* Andrew Morton <akpm@linux-foundation.org> wrote:

> I think that a good way to present this is as a preparatory patch: 
> "convert the fourth argument to handle_mm_fault() from a boolean to a 
> flags word".  That would be a simple do-nothing patch which affects all 
> architectures and which ideally would break the build at any unconverted 
> code sites.  (Change the argument order?)

why not do what i suggested: refactor do_page_fault() into a platform 
specific / kernel-internal faults and into a generic-user-pte function. 
That alone would increase readability i suspect.

Then the 'retry' is multiple calls from handle_pte_fault().

Or something like that.

It looks wrong to me to pass another flag through this hot codepath, just 
to express a property that the _highlevel_ code is interested in.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2009-01-26 23:57       ` Ingo Molnar
@ 2009-01-27  4:34         ` KOSAKI Motohiro
  0 siblings, 0 replies; 16+ messages in thread
From: KOSAKI Motohiro @ 2009-01-27  4:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: kosaki.motohiro, Andrew Morton, Ying Han, linux-mm, linux-kernel,
	mikew, rientjes, rohitseth, hugh, a.p.zijlstra, hpa, edwintorok,
	lee.schermerhorn, npiggin

> 
> * Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > I think that a good way to present this is as a preparatory patch: 
> > "convert the fourth argument to handle_mm_fault() from a boolean to a 
> > flags word".  That would be a simple do-nothing patch which affects all 
> > architectures and which ideally would break the build at any unconverted 
> > code sites.  (Change the argument order?)
> 
> why not do what i suggested: refactor do_page_fault() into a platform 
> specific / kernel-internal faults and into a generic-user-pte function. 
> That alone would increase readability i suspect.
> 
> Then the 'retry' is multiple calls from handle_pte_fault().
> 
> Or something like that.
> 
> It looks wrong to me to pass another flag through this hot codepath, just 
> to express a property that the _highlevel_ code is interested in.

I like this idea :)



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2008-12-05 19:40 [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY Ying Han
                   ` (2 preceding siblings ...)
  2009-01-26 19:37 ` Andrew Morton
@ 2009-03-31 22:00 ` Andrew Morton
  2009-04-01  0:17   ` Ying Han
  2009-04-03  8:22   ` [PATCH] vfs: fix find_lock_page_retry() return value parsing Wu Fengguang
  3 siblings, 2 replies; 16+ messages in thread
From: Andrew Morton @ 2009-03-31 22:00 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, mingo, mikew, rientjes, rohitseth, hugh,
	a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <yinghan@google.com> wrote:

> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
>   handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
>   cleaner.
> 
> todo:
> - there is potential a starvation hole with the retry. By the time the
>   retry returns, the pages might be released. we can make change by holding
>   page reference as well as remembering what the page "was"(in case the
>   file was truncated). any suggestion here are welcomed.
> 
> I also made patches for all other arch. I am posting x86_64 here first and
> i will post others by the time everyone feels comfortable of this patch.

I'm about to send this into Linus.  What happened to the patches for
other architectures?

Please send them over when convenient and I'll work on getting them
trickled out to arch maintainers, thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY
  2009-03-31 22:00 ` Andrew Morton
@ 2009-04-01  0:17   ` Ying Han
  2009-04-03  8:22   ` [PATCH] vfs: fix find_lock_page_retry() return value parsing Wu Fengguang
  1 sibling, 0 replies; 16+ messages in thread
From: Ying Han @ 2009-04-01  0:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, mingo, mikew, rientjes, rohitseth, hugh,
	a.p.zijlstra, hpa, edwintorok, lee.schermerhorn, npiggin

Thanks Andrew. I have the patches for all the arches and i just need
to clean them up a little bit. I will send them to you ASAP.

--Ying

On Tue, Mar 31, 2009 at 3:00 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Fri, 5 Dec 2008 11:40:19 -0800
> Ying Han <yinghan@google.com> wrote:
>
>> changelog[v2]:
>> - reduce the runtime overhead by extending the 'write' flag of
>>   handle_mm_fault() to indicate the retry hint.
>> - add another two branches in filemap_fault with retry logic.
>> - replace find_lock_page with find_lock_page_retry to make the code
>>   cleaner.
>>
>> todo:
>> - there is potential a starvation hole with the retry. By the time the
>>   retry returns, the pages might be released. we can make change by holding
>>   page reference as well as remembering what the page "was"(in case the
>>   file was truncated). any suggestion here are welcomed.
>>
>> I also made patches for all other arch. I am posting x86_64 here first and
>> i will post others by the time everyone feels comfortable of this patch.
>
> I'm about to send this into Linus.  What happened to the patches for
> other architectures?
>
> Please send them over when convenient and I'll work on getting them
> trickled out to arch maintainers, thanks.
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] vfs: fix find_lock_page_retry() return value parsing
  2009-03-31 22:00 ` Andrew Morton
  2009-04-01  0:17   ` Ying Han
@ 2009-04-03  8:22   ` Wu Fengguang
  2009-04-03  8:35     ` [PATCH v2] " Wu Fengguang
  1 sibling, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-03  8:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, mingo, mikew, rientjes,
	rohitseth, hugh, a.p.zijlstra, hpa, edwintorok, lee.schermerhorn,
	npiggin

find_lock_page_retry() won't touch the *ppage value when returning
VM_FAULT_RETRY. This is fine except for the case

        if (VM_RandomReadHint())
                goto no_cached_page;

where the 'page' could be undefined after calling find_lock_page_retry().

Fix it by checking the VM_FAULT_RETRY case first.

Cc: Ying Han <yinghan@google.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/filemap.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -759,7 +759,7 @@ EXPORT_SYMBOL(find_lock_page);
  * @retry: 1 indicate caller tolerate a retry.
  *
  * If retry flag is on, and page is already locked by someone else, return
- * a hint of retry.
+ * a hint of retry and leave *ppage untouched.
  *
  * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
  * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
@@ -1672,10 +1672,10 @@ no_cached_page:
 
 		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 					vma, &page, retry_flag);
+		if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
 		if (!page)
 			goto retry_find_nopage;
-		else if (retry_ret == VM_FAULT_RETRY)
-			return retry_ret;
 		else
 			goto retry_page_update;
 	}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] vfs: fix find_lock_page_retry() return value parsing
  2009-04-03  8:22   ` [PATCH] vfs: fix find_lock_page_retry() return value parsing Wu Fengguang
@ 2009-04-03  8:35     ` Wu Fengguang
  2009-04-03  8:55       ` [PATCH] vfs: reduce page fault retry code Wu Fengguang
  0 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-03  8:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, mingo, mikew, rientjes,
	rohitseth, hugh, a.p.zijlstra, hpa, edwintorok, lee.schermerhorn,
	npiggin

find_lock_page_retry() won't touch the *ppage value when returning
VM_FAULT_RETRY. This is fine except for the case

        if (VM_RandomReadHint())
                goto no_cached_page;

where the 'page' could be undefined after calling find_lock_page_retry().

Fix it by checking the VM_FAULT_RETRY case first. Also do this for the
other two find_lock_page_retry() invocations for the sake of consistency.

Cc: Ying Han <yinghan@google.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/filemap.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -759,7 +759,7 @@ EXPORT_SYMBOL(find_lock_page);
  * @retry: 1 indicate caller tolerate a retry.
  *
  * If retry flag is on, and page is already locked by someone else, return
- * a hint of retry.
+ * a hint of retry and leave *ppage untouched.
  *
  * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
  * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
@@ -1575,10 +1575,10 @@ retry_find_nopage:
 							   vmf->pgoff, 1);
 			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 						vma, &page, retry_flag);
-			if (!page)
-				goto no_cached_page;
 			if (retry_ret == VM_FAULT_RETRY)
 				return retry_ret;
+			if (!page)
+				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
 			page_cache_async_readahead(mapping, ra, file, page,
@@ -1617,10 +1617,10 @@ retry_find_nopage:
 		}
 		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 				vma, &page, retry_flag);
-		if (!page)
-			goto no_cached_page;
 		if (retry_ret == VM_FAULT_RETRY)
 			return retry_ret;
+		if (!page)
+			goto no_cached_page;
 	}
 
 	if (!did_readaround)
@@ -1672,10 +1672,10 @@ no_cached_page:
 
 		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 					vma, &page, retry_flag);
+		if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
 		if (!page)
 			goto retry_find_nopage;
-		else if (retry_ret == VM_FAULT_RETRY)
-			return retry_ret;
 		else
 			goto retry_page_update;
 	}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] vfs: reduce page fault retry code
  2009-04-03  8:35     ` [PATCH v2] " Wu Fengguang
@ 2009-04-03  8:55       ` Wu Fengguang
  2009-04-03 10:53         ` Wu Fengguang
  0 siblings, 1 reply; 16+ messages in thread
From: Wu Fengguang @ 2009-04-03  8:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, mingo, mikew, rientjes,
	rohitseth, hugh, a.p.zijlstra, hpa, edwintorok, lee.schermerhorn,
	npiggin

find_lock_page_retry() works the same way as find_lock_page()
when retry_flag=0. And their return value handling shall work
(almost) in the same way, or it will already be a bug.

So the !retry_flag special casing can be eliminated.

Cc: Ying Han <yinghan@google.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/filemap.c |    7 -------
 1 file changed, 7 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -1663,13 +1663,6 @@ no_cached_page:
 	 * meantime, we'll just come back here and read it again.
 	 */
 	if (error >= 0) {
-		/*
-		 * If caller cannot tolerate a retry in the ->fault path
-		 * go back to check the page again.
-		 */
-		if (!retry_flag)
-			goto retry_find;
-
 		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 					vma, &page, retry_flag);
 		if (retry_ret == VM_FAULT_RETRY)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] vfs: reduce page fault retry code
  2009-04-03  8:55       ` [PATCH] vfs: reduce page fault retry code Wu Fengguang
@ 2009-04-03 10:53         ` Wu Fengguang
  0 siblings, 0 replies; 16+ messages in thread
From: Wu Fengguang @ 2009-04-03 10:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, mingo, mikew, rientjes,
	rohitseth, hugh, a.p.zijlstra, hpa, edwintorok, lee.schermerhorn,
	npiggin

On Fri, Apr 03, 2009 at 04:55:03PM +0800, Wu Fengguang wrote:
> find_lock_page_retry() works the same way as find_lock_page()
> when retry_flag=0. And their return value handling shall work
> (almost) in the same way, or it will already be a bug.
> 
> So the !retry_flag special casing can be eliminated.
> 
> Cc: Ying Han <yinghan@google.com>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
>  mm/filemap.c |    7 -------
>  1 file changed, 7 deletions(-)
> 
> --- mm.orig/mm/filemap.c
> +++ mm/mm/filemap.c
> @@ -1663,13 +1663,6 @@ no_cached_page:
>  	 * meantime, we'll just come back here and read it again.
>  	 */
>  	if (error >= 0) {
> -		/*
> -		 * If caller cannot tolerate a retry in the ->fault path
> -		 * go back to check the page again.
> -		 */
> -		if (!retry_flag)
> -			goto retry_find;
> -
>  		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>  					vma, &page, retry_flag);
>  		if (retry_ret == VM_FAULT_RETRY)

In fact I guess we can shrink the code more aggressively. 
The only difference is the extra ra->mmap_miss--, which will
be moved to other place in another planned patch.

Thanks,
Fengguang
---
 mm/filemap.c |   22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -1565,7 +1565,6 @@ int filemap_fault(struct vm_area_struct 
 retry_find:
 	page = find_lock_page(mapping, vmf->pgoff);
 
-retry_find_nopage:
 	/*
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
@@ -1615,6 +1614,7 @@ retry_find_nopage:
 				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
+retry_find_retry:
 		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
 				vma, &page, retry_flag);
 		if (retry_ret == VM_FAULT_RETRY)
@@ -1626,7 +1626,6 @@ retry_find_nopage:
 	if (!did_readaround)
 		ra->mmap_miss--;
 
-retry_page_update:
 	/*
 	 * We have a locked page in the page cache, now we need to check
 	 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1662,23 +1661,8 @@ no_cached_page:
 	 * In the unlikely event that someone removed it in the
 	 * meantime, we'll just come back here and read it again.
 	 */
-	if (error >= 0) {
-		/*
-		 * If caller cannot tolerate a retry in the ->fault path
-		 * go back to check the page again.
-		 */
-		if (!retry_flag)
-			goto retry_find;
-
-		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
-					vma, &page, retry_flag);
-		if (retry_ret == VM_FAULT_RETRY)
-			return retry_ret;
-		if (!page)
-			goto retry_find_nopage;
-		else
-			goto retry_page_update;
-	}
+	if (error >= 0)
+		goto retry_find_retry;
 
 	/*
 	 * An error return from page_cache_read can result if the

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2009-04-03 10:53 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-05 19:40 [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY Ying Han
2008-12-06  9:52 ` Török Edwin
2008-12-06  9:55   ` Török Edwin
2008-12-08  1:43     ` Ying Han
2008-12-09 17:57       ` Ying Han
2008-12-09 19:31 ` Andrew Morton
2009-01-26 19:37 ` Andrew Morton
     [not found]   ` <604427e00901261508n7967ea74m3deacd3213c86065@mail.gmail.com>
2009-01-26 23:52     ` Andrew Morton
2009-01-26 23:57       ` Ingo Molnar
2009-01-27  4:34         ` KOSAKI Motohiro
2009-03-31 22:00 ` Andrew Morton
2009-04-01  0:17   ` Ying Han
2009-04-03  8:22   ` [PATCH] vfs: fix find_lock_page_retry() return value parsing Wu Fengguang
2009-04-03  8:35     ` [PATCH v2] " Wu Fengguang
2009-04-03  8:55       ` [PATCH] vfs: reduce page fault retry code Wu Fengguang
2009-04-03 10:53         ` Wu Fengguang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).