linux-mm.kvack.org archive mirror
* [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
@ 2013-01-07  8:12 Shaohua Li
  2013-01-07 20:06 ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2013-01-07  8:12 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, hughd, riel


Make madvise(MADV_WILLNEED) support swap file prefetch. If memory has been
swapped out, this syscall starts asynchronous swap-in readahead for it. It
has no effect on memory that has not been swapped out.
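
For illustration only (not part of the patch): a minimal userspace sketch of
the intended usage. The mapping size and the assumption that the range has
already been swapped out are hypothetical.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB, arbitrary example size */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 1, len);	/* populate; may be swapped out under pressure */

	/* ... later, before the data is needed again ... */

	/* Ask the kernel to start swap-in readahead for the whole range. */
	if (madvise(buf, len, MADV_WILLNEED))
		perror("madvise");

	/* Subsequent accesses should then mostly hit the swap cache. */
	for (size_t i = 0; i < len; i += 4096)
		buf[i]++;

	munmap(buf, len);
	return 0;
}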

Signed-off-by: Shaohua Li <shli@fusionio.com>
---
 mm/madvise.c |   94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

Index: linux/mm/madvise.c
===================================================================
--- linux.orig/mm/madvise.c	2013-01-07 15:27:45.064893547 +0800
+++ linux/mm/madvise.c	2013-01-07 15:27:53.636787409 +0800
@@ -16,6 +16,9 @@
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,82 @@ out:
 	return error;
 }
 
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+	unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->private;
+	unsigned long index;
+
+	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		return 0;
+
+	for (index = start; index != end; index += PAGE_SIZE) {
+		pte_t pte;
+		swp_entry_t entry;
+		struct page *page;
+		spinlock_t *ptl;
+
+		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+		pte_unmap_unlock(orig_pte, ptl);
+
+		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+			continue;
+		entry = pte_to_swp_entry(pte);
+		if (unlikely(non_swap_entry(entry)))
+			continue;
+
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+								vma, index);
+		if (page)
+			page_cache_release(page);
+	}
+
+	return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	struct mm_walk walk = {
+		.mm = vma->vm_mm,
+		.pmd_entry = swapin_walk_pmd_entry,
+		.private = vma,
+	};
+
+	walk_page_range(start, end, &walk);
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct address_space *mapping)
+{
+	pgoff_t index;
+	struct page *page;
+	swp_entry_t swap;
+
+	for (; start < end; start += PAGE_SIZE) {
+		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+		page = find_get_page(mapping, index);
+		if (!radix_tree_exceptional_entry(page)) {
+			if (page)
+				page_cache_release(page);
+			continue;
+		}
+		swap = radix_to_swp_entry(page);
+		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+								NULL, 0);
+		if (page)
+			page_cache_release(page);
+	}
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
 /*
  * Schedule all required I/O operations.  Do not wait for completion.
  */
@@ -140,6 +219,18 @@ static long madvise_willneed(struct vm_a
 {
 	struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+		*prev = vma;
+		if (!file)
+			force_swapin_readahead(vma, start, end);
+		else
+			force_shm_swapin_readahead(vma, start, end,
+						file->f_mapping);
+		return 0;
+	}
+#endif
+
 	if (!file)
 		return -EBADF;
 
@@ -371,6 +462,7 @@ SYSCALL_DEFINE3(madvise, unsigned long,
 	int error = -EINVAL;
 	int write;
 	size_t len;
+	struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,6 +502,7 @@ SYSCALL_DEFINE3(madvise, unsigned long,
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
+	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
@@ -445,6 +538,7 @@ SYSCALL_DEFINE3(madvise, unsigned long,
 			vma = find_vma(current->mm, start);
 	}
 out:
+	blk_finish_plug(&plug);
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-07  8:12 [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch Shaohua Li
@ 2013-01-07 20:06 ` Andrew Morton
  2013-01-08  2:16   ` Wanpeng Li
                     ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Andrew Morton @ 2013-01-07 20:06 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-mm, hughd, riel

On Mon, 7 Jan 2013 16:12:37 +0800
Shaohua Li <shli@kernel.org> wrote:

> 
> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> this syscall can do swapin prefetch. It has no impact if the memory isn't
> swapout.

Seems sensible.

> @@ -140,6 +219,18 @@ static long madvise_willneed(struct vm_a
>  {
>  	struct file *file = vma->vm_file;
>  
> +#ifdef CONFIG_SWAP

It's odd that you put the ifdef in there, but then didn't test it!


From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix

fix CONFIG_SWAP=n build

Cc: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/madvise.c |    2 ++
 1 file changed, 2 insertions(+)

diff -puN mm/madvise.c~mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix mm/madvise.c
--- a/mm/madvise.c~mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix
+++ a/mm/madvise.c
@@ -134,6 +134,7 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
 {
@@ -209,6 +210,7 @@ static void force_shm_swapin_readahead(s
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
+#endif		/* CONFIG_SWAP */
 
 /*
  * Schedule all required I/O operations.  Do not wait for completion.
_


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-07 20:06 ` Andrew Morton
  2013-01-08  2:16   ` Wanpeng Li
@ 2013-01-08  2:16   ` Wanpeng Li
       [not found]   ` <50eb8180.6887320a.3f90.58b0SMTPIN_ADDED_BROKEN@mx.google.com>
  2 siblings, 0 replies; 12+ messages in thread
From: Wanpeng Li @ 2013-01-08  2:16 UTC (permalink / raw)
  To: Andrew Morton, Shaohua Li; +Cc: linux-mm, hughd, riel

On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
>On Mon, 7 Jan 2013 16:12:37 +0800
>Shaohua Li <shli@kernel.org> wrote:
>
>> 
>> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
>> this syscall can do swapin prefetch. It has no impact if the memory isn't
>> swapout.
>
>Seems sensible.

Hi Andrew and Shaohua,

How does this perform in the scenario of serious memory pressure? In that
case the pages in swap are highly fragmented and a readahead hit is all but
impossible. Perhaps the WILLNEED path should add a check to skip readahead
then, since the swap-in would only lead to unnecessary memory allocation.

Regards,
Wanpeng Li 

>
>> @@ -140,6 +219,18 @@ static long madvise_willneed(struct vm_a
>>  {
>>  	struct file *file = vma->vm_file;
>>  
>> +#ifdef CONFIG_SWAP
>
>It's odd that you put the ifdef in there, but then didn't test it!
>
>
>From: Andrew Morton <akpm@linux-foundation.org>
>Subject: mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix
>
>fix CONFIG_SWAP=n build
>
>Cc: Shaohua Li <shli@fusionio.com>
>Cc: Hugh Dickins <hughd@google.com>
>Cc: Rik van Riel <riel@redhat.com>
>Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>---
>
> mm/madvise.c |    2 ++
> 1 file changed, 2 insertions(+)
>
>diff -puN mm/madvise.c~mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix mm/madvise.c
>--- a/mm/madvise.c~mm-make-madvisemadv_willneed-support-swap-file-prefetch-fix
>+++ a/mm/madvise.c
>@@ -134,6 +134,7 @@ out:
> 	return error;
> }
>
>+#ifdef CONFIG_SWAP
> static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
> 	unsigned long end, struct mm_walk *walk)
> {
>@@ -209,6 +210,7 @@ static void force_shm_swapin_readahead(s
>
> 	lru_add_drain();	/* Push any new pages onto the LRU now */
> }
>+#endif		/* CONFIG_SWAP */
>
> /*
>  * Schedule all required I/O operations.  Do not wait for completion.
>_
>


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
       [not found]   ` <50eb8180.6887320a.3f90.58b0SMTPIN_ADDED_BROKEN@mx.google.com>
@ 2013-01-08  4:26     ` Shaohua Li
  2013-01-08  5:38       ` Minchan Kim
  2013-01-08  8:45       ` Simon Jeons
  0 siblings, 2 replies; 12+ messages in thread
From: Shaohua Li @ 2013-01-08  4:26 UTC (permalink / raw)
  To: Wanpeng Li; +Cc: Andrew Morton, linux-mm, hughd, riel

On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> >On Mon, 7 Jan 2013 16:12:37 +0800
> >Shaohua Li <shli@kernel.org> wrote:
> >
> >> 
> >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> >> swapout.
> >
> >Seems sensible.
> 
> Hi Andrew and Shaohua,
> 
> What's the performance in the scenario of serious memory pressure? Since
> in this case pages in swap are highly fragmented and cache hit is most
> impossible. If WILLNEED path should add a check to skip readahead in
> this case since swapin only leads to unnecessary memory allocation. 

Pages in swap are not highly fragmented if the memory was accessed
sequentially. In that case the pages end up on the LRU list side by side, so
at reclaim time they also get adjacent swap slots. When the app then does
swap prefetch, we issue sequential disk accesses and can merge the small
requests into big ones.

Another advantage is that prefetch can drive a high disk iodepth. For
sequential access this produces big requests; even for random access, a high
iodepth gives much better performance, especially on SSDs.
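
To illustrate the merging mechanism (a sketch only, not part of the patch:
prefetch_swap_entries_batched() and its arguments are hypothetical, while the
kernel calls are the ones the patch uses), requests submitted between
blk_start_plug() and blk_finish_plug() collect on a per-task list and are
dispatched together, which is what lets the page-sized swap reads merge and
keeps the device queue deep:

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>

/* Hypothetical helper: prefetch a batch of swap entries under one plug. */
static void prefetch_swap_entries_batched(swp_entry_t *entries, int nr,
					  struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		/* Each call may submit a small read; it sits on the
		 * per-task plug list instead of going straight to the
		 * device. */
		struct page *page = read_swap_cache_async(entries[i],
					GFP_HIGHUSER_MOVABLE, vma, addr);

		if (page)
			page_cache_release(page);
	}
	/* Flush the batch: adjacent reads can be merged and the device
	 * sees a deep queue instead of one request at a time. */
	blk_finish_plug(&plug);
}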

Thanks,
Shaohua


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  4:26     ` Shaohua Li
@ 2013-01-08  5:38       ` Minchan Kim
  2013-01-08  7:32         ` Shaohua Li
  2013-01-08  8:45       ` Simon Jeons
  1 sibling, 1 reply; 12+ messages in thread
From: Minchan Kim @ 2013-01-08  5:38 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

Hi Shaohua,

On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > >On Mon, 7 Jan 2013 16:12:37 +0800
> > >Shaohua Li <shli@kernel.org> wrote:
> > >
> > >> 
> > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > >> swapout.
> > >
> > >Seems sensible.
> > 
> > Hi Andrew and Shaohua,
> > 
> > What's the performance in the scenario of serious memory pressure? Since
> > in this case pages in swap are highly fragmented and cache hit is most
> > impossible. If WILLNEED path should add a check to skip readahead in
> > this case since swapin only leads to unnecessary memory allocation. 
> 
> pages in swap are not highly fragmented if you access memory sequentially. In
> that case, the pages you accessed will be added to lru list side by side. So if
> app does swap prefetch, we can do sequential disk access and merge small
> request to big one.

How can you make sure that the range passed to WILLNEED was always accessed sequentially?

> 
> Another advantage is prefetch can drive high disk iodepth.  For sequential

What does 'iodepth' mean? I couldn't find it on Google. :(

> access, this can cause big request. Even for random access, high iodepth has
> much better performance especially for SSD.

So you mean WILLNEED is always a win, for both random and sequential access, on an SSD?
Then how about a rotating disk?

Wanpeng's comment makes sense to me, so I guess others may have the same question
about this patch. It would be better to write your rationale in the changelog.

> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  5:38       ` Minchan Kim
@ 2013-01-08  7:32         ` Shaohua Li
  2013-01-08  7:54           ` Simon Jeons
  2013-01-08  8:38           ` Minchan Kim
  0 siblings, 2 replies; 12+ messages in thread
From: Shaohua Li @ 2013-01-08  7:32 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

On Tue, Jan 08, 2013 at 02:38:56PM +0900, Minchan Kim wrote:
> Hi Shaohua,
> 
> On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> > On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > > >On Mon, 7 Jan 2013 16:12:37 +0800
> > > >Shaohua Li <shli@kernel.org> wrote:
> > > >
> > > >> 
> > > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > > >> swapout.
> > > >
> > > >Seems sensible.
> > > 
> > > Hi Andrew and Shaohua,
> > > 
> > > What's the performance in the scenario of serious memory pressure? Since
> > > in this case pages in swap are highly fragmented and cache hit is most
> > > impossible. If WILLNEED path should add a check to skip readahead in
> > > this case since swapin only leads to unnecessary memory allocation. 
> > 
> > pages in swap are not highly fragmented if you access memory sequentially. In
> > that case, the pages you accessed will be added to lru list side by side. So if
> > app does swap prefetch, we can do sequential disk access and merge small
> > request to big one.
> 
> How can you make sure that the range of WILLNEED was always sequentially accesssed?

you can't guarantee this even for file access.

> > Another advantage is prefetch can drive high disk iodepth.  For sequential
> 
> What does it mean 'iodepth'? I failed to grep it in google. :(

io depth: how many requests are in flight at a given time.

> > access, this can cause big request. Even for random access, high iodepth has
> > much better performance especially for SSD.
> 
> So you mean WILLNEED is always good in where both random and sequential in "SSD"?
> Then, how about the "Disk"?

Hmm, even for a hard disk, random access at high iodepth is faster than at an
iodepth of one, since today's disks do NCQ, though the speedup isn't as
significant as on an SSD. For sequential access, both hard disks and SSDs
perform better with a higher iodepth.

> Wanpeng's comment makes sense to me so I guess others can have a same question
> about this patch. So it would be better to write your rationale in changelog.

I will, but the question is really the same as asking why an app would want
to prefetch file pages; I thought that was common sense. Problems such as the
extra memory allocation exist for file prefetch too, and the advantages
(better I/O access patterns, CPU and disk operating in parallel, and so on)
apply to both file and swap prefetch.

Prefetch should never be slower than no prefetch. It's another story if the
app uses the hint badly, and we definitely don't need to design for a wrong
app: if the app doesn't know how to use the API, it can simply not use it.

Thanks,
Shaohua


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  7:32         ` Shaohua Li
@ 2013-01-08  7:54           ` Simon Jeons
  2013-01-08  8:38           ` Minchan Kim
  1 sibling, 0 replies; 12+ messages in thread
From: Simon Jeons @ 2013-01-08  7:54 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Minchan Kim, Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

On Tue, 2013-01-08 at 15:32 +0800, Shaohua Li wrote:
> On Tue, Jan 08, 2013 at 02:38:56PM +0900, Minchan Kim wrote:
> > Hi Shaohua,
> > 
> > On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> > > On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > > > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > > > >On Mon, 7 Jan 2013 16:12:37 +0800
> > > > >Shaohua Li <shli@kernel.org> wrote:
> > > > >
> > > > >> 
> > > > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > > > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > > > >> swapout.
> > > > >
> > > > >Seems sensible.
> > > > 
> > > > Hi Andrew and Shaohua,
> > > > 
> > > > What's the performance in the scenario of serious memory pressure? Since
> > > > in this case pages in swap are highly fragmented and cache hit is most
> > > > impossible. If WILLNEED path should add a check to skip readahead in
> > > > this case since swapin only leads to unnecessary memory allocation. 
> > > 
> > > pages in swap are not highly fragmented if you access memory sequentially. In
> > > that case, the pages you accessed will be added to lru list side by side. So if
> > > app does swap prefetch, we can do sequential disk access and merge small
> > > request to big one.
> > 
> > How can you make sure that the range of WILLNEED was always sequentially accesssed?
> 
> you can't guarantee this even for file access.
> 
> > > Another advantage is prefetch can drive high disk iodepth.  For sequential
> > 
> > What does it mean 'iodepth'? I failed to grep it in google. :(
> 
> io depth. How many requests are inflight at a givin time.
> 
> > > access, this can cause big request. Even for random access, high iodepth has
> > > much better performance especially for SSD.
> > 
> > So you mean WILLNEED is always good in where both random and sequential in "SSD"?
> > Then, how about the "Disk"?
> 
> Hmm, even for hard disk, high iodepth random access is faster than single
> iodepth access. Today's disk is NCQ disk. But the speedup isn't that
> significant like a SSD. For sequential access, both harddisk and SSD have
> better performance with higher iodepth.
> 
> > Wanpeng's comment makes sense to me so I guess others can have a same question
> > about this patch. So it would be better to write your rationale in changelog.
> 
> I would, but the question is just like why app wants to prefetch file pages. I
> thought it's commonsense. The problem like memory allocation exists in file

It just depends on how you understand it. :)

> prefetch too. The advantages (better IO access, CPU and disk can operate in
> parallel and so on) apply for both file and swap prefetch.
> 
> prefetch should never be slower non-prefetch. That's another story if app is
> very wrong. we definitely don't need consider a wrong app. If the app doesn't
> know how to use the API, the app can just don't use it.
> 
> Thanks,
> Shaohua
> 

* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  7:32         ` Shaohua Li
  2013-01-08  7:54           ` Simon Jeons
@ 2013-01-08  8:38           ` Minchan Kim
  2013-01-08  9:13             ` Shaohua Li
  1 sibling, 1 reply; 12+ messages in thread
From: Minchan Kim @ 2013-01-08  8:38 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

On Tue, Jan 08, 2013 at 03:32:29PM +0800, Shaohua Li wrote:
> On Tue, Jan 08, 2013 at 02:38:56PM +0900, Minchan Kim wrote:
> > Hi Shaohua,
> > 
> > On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> > > On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > > > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > > > >On Mon, 7 Jan 2013 16:12:37 +0800
> > > > >Shaohua Li <shli@kernel.org> wrote:
> > > > >
> > > > >> 
> > > > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > > > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > > > >> swapout.
> > > > >
> > > > >Seems sensible.
> > > > 
> > > > Hi Andrew and Shaohua,
> > > > 
> > > > What's the performance in the scenario of serious memory pressure? Since
> > > > in this case pages in swap are highly fragmented and cache hit is most
> > > > impossible. If WILLNEED path should add a check to skip readahead in
> > > > this case since swapin only leads to unnecessary memory allocation. 
> > > 
> > > pages in swap are not highly fragmented if you access memory sequentially. In
> > > that case, the pages you accessed will be added to lru list side by side. So if
> > > app does swap prefetch, we can do sequential disk access and merge small
> > > request to big one.
> > 
> > How can you make sure that the range of WILLNEED was always sequentially accesssed?
> 
> you can't guarantee this even for file access.

Indeed.

> 
> > > Another advantage is prefetch can drive high disk iodepth.  For sequential
> > 
> > What does it mean 'iodepth'? I failed to grep it in google. :(
> 
> io depth. How many requests are inflight at a givin time.

Thanks for the info!

> 
> > > access, this can cause big request. Even for random access, high iodepth has
> > > much better performance especially for SSD.
> > 
> > So you mean WILLNEED is always good in where both random and sequential in "SSD"?
> > Then, how about the "Disk"?
> 
> Hmm, even for hard disk, high iodepth random access is faster than single
> iodepth access. Today's disk is NCQ disk. But the speedup isn't that
> significant like a SSD. For sequential access, both harddisk and SSD have
> better performance with higher iodepth.
> 
> > Wanpeng's comment makes sense to me so I guess others can have a same question
> > about this patch. So it would be better to write your rationale in changelog.
> 
> I would, but the question is just like why app wants to prefetch file pages. I
> thought it's commonsense. The problem like memory allocation exists in file
> prefetch too. The advantages (better IO access, CPU and disk can operate in
> parallel and so on) apply for both file and swap prefetch.

Agreed. But I have a question about the semantics of madvise(MADV_WILLNEED)
on an anon vma. If Linux starts to support it for anon memory, users could
misunderstand it as follows:

a user might think "we will start using the anonymous pages in that range
soon", so the hint tells the kernel to map all pages of the range into the
page tables in advance (i.e. a pre-fault, like MAP_POPULATE), and to read
ahead any of the pages that have been swapped out. What do you think about
that? For clarification it would be better to add a man page description,
Ccing the man page maintainer.

> 
> prefetch should never be slower non-prefetch. That's another story if app is
> very wrong. we definitely don't need consider a wrong app. If the app doesn't
> know how to use the API, the app can just don't use it.

Fair enough.

> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  4:26     ` Shaohua Li
  2013-01-08  5:38       ` Minchan Kim
@ 2013-01-08  8:45       ` Simon Jeons
  1 sibling, 0 replies; 12+ messages in thread
From: Simon Jeons @ 2013-01-08  8:45 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

On Tue, 2013-01-08 at 12:26 +0800, Shaohua Li wrote:
> On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > >On Mon, 7 Jan 2013 16:12:37 +0800
> > >Shaohua Li <shli@kernel.org> wrote:
> > >
> > >> 
> > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > >> swapout.
> > >
> > >Seems sensible.
> > 
> > Hi Andrew and Shaohua,
> > 
> > What's the performance in the scenario of serious memory pressure? Since
> > in this case pages in swap are highly fragmented and cache hit is most
> > impossible. If WILLNEED path should add a check to skip readahead in
> > this case since swapin only leads to unnecessary memory allocation. 
> 
> pages in swap are not highly fragmented if you access memory sequentially. In

Under serious memory pressure pages are swapped in and out frequently; how can
you guarantee that the swap area does not become fragmented?

> that case, the pages you accessed will be added to lru list side by side. So if
> app does swap prefetch, we can do sequential disk access and merge small
> request to big one.
> 
> Another advantage is prefetch can drive high disk iodepth.  For sequential
> access, this can cause big request. Even for random access, high iodepth has
> much better performance especially for SSD.
> 
> Thanks,
> Shaohua
> 

* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  8:38           ` Minchan Kim
@ 2013-01-08  9:13             ` Shaohua Li
  2013-01-09  7:28               ` Minchan Kim
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2013-01-08  9:13 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel

On Tue, Jan 08, 2013 at 05:38:53PM +0900, Minchan Kim wrote:
> On Tue, Jan 08, 2013 at 03:32:29PM +0800, Shaohua Li wrote:
> > On Tue, Jan 08, 2013 at 02:38:56PM +0900, Minchan Kim wrote:
> > > Hi Shaohua,
> > > 
> > > On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> > > > On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > > > > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > > > > >On Mon, 7 Jan 2013 16:12:37 +0800
> > > > > >Shaohua Li <shli@kernel.org> wrote:
> > > > > >
> > > > > >> 
> > > > > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > > > > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > > > > >> swapout.
> > > > > >
> > > > > >Seems sensible.
> > > > > 
> > > > > Hi Andrew and Shaohua,
> > > > > 
> > > > > What's the performance in the scenario of serious memory pressure? Since
> > > > > in this case pages in swap are highly fragmented and cache hit is most
> > > > > impossible. If WILLNEED path should add a check to skip readahead in
> > > > > this case since swapin only leads to unnecessary memory allocation. 
> > > > 
> > > > pages in swap are not highly fragmented if you access memory sequentially. In
> > > > that case, the pages you accessed will be added to lru list side by side. So if
> > > > app does swap prefetch, we can do sequential disk access and merge small
> > > > request to big one.
> > > 
> > > How can you make sure that the range of WILLNEED was always sequentially accesssed?
> > 
> > you can't guarantee this even for file access.
> 
> Indeed.
> 
> > 
> > > > Another advantage is prefetch can drive high disk iodepth.  For sequential
> > > 
> > > What does it mean 'iodepth'? I failed to grep it in google. :(
> > 
> > io depth. How many requests are inflight at a givin time.
> 
> Thanks for the info!
> 
> > 
> > > > access, this can cause big request. Even for random access, high iodepth has
> > > > much better performance especially for SSD.
> > > 
> > > So you mean WILLNEED is always good in where both random and sequential in "SSD"?
> > > Then, how about the "Disk"?
> > 
> > Hmm, even for hard disk, high iodepth random access is faster than single
> > iodepth access. Today's disk is NCQ disk. But the speedup isn't that
> > significant like a SSD. For sequential access, both harddisk and SSD have
> > better performance with higher iodepth.
> > 
> > > Wanpeng's comment makes sense to me so I guess others can have a same question
> > > about this patch. So it would be better to write your rationale in changelog.
> > 
> > I would, but the question is just like why app wants to prefetch file pages. I
> > thought it's commonsense. The problem like memory allocation exists in file
> > prefetch too. The advantages (better IO access, CPU and disk can operate in
> > parallel and so on) apply for both file and swap prefetch.
> 
> Agreed. But I have a question about semantic of madvise(DONTNEED) of anon vma.
> If Linux start to support it for anon, user can misunderstand it following as.
> 
> User might think we start to use anonymous pages in that range soon so he
> gives the hint to kernel to map all pages of the range to page table in advance.
> (ie, pre page fault like MAP_POPULATE) and if one of the page might be
> swapped out, readahead it. What do you think about it?
> For clarification, it would be better to add man page description with Ccing
> man page maintainer.

There is no confusion if the page exists or is swapped out; I think the case
you are thinking of is a page that isn't populated yet. The man page describes
WILLNEED as "it might be a good idea to read some pages ahead", which makes it
clear this isn't meant to populate memory and matches what we do here. But I'm
not sure what the precise description should be.
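
As a rough contrast (a sketch under the assumption of an existing anonymous
buffer; the function name and arguments are illustrative only): MAP_POPULATE
pre-faults and maps pages at mmap() time, whereas MADV_WILLNEED only schedules
readahead for currently swapped-out pages and leaves the page tables alone.

#include <sys/mman.h>

static void contrast(char *existing_buf, size_t len)
{
	/* Pre-fault: pages are allocated and mapped immediately. */
	void *populated = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
			       -1, 0);
	(void)populated;

	/* Readahead only: swapped-out pages in the range are read into the
	 * swap cache asynchronously; resident pages are untouched and no
	 * page-table entries are changed. */
	madvise(existing_buf, len, MADV_WILLNEED);
}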

Thanks,
Shaohua


* Re: [patch]mm: make madvise(MADV_WILLNEED) support swap file prefetch
  2013-01-08  9:13             ` Shaohua Li
@ 2013-01-09  7:28               ` Minchan Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2013-01-09  7:28 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Wanpeng Li, Andrew Morton, linux-mm, hughd, riel, Johannes Weiner,
	mtk.manpages

On Tue, Jan 08, 2013 at 05:13:24PM +0800, Shaohua Li wrote:
> On Tue, Jan 08, 2013 at 05:38:53PM +0900, Minchan Kim wrote:
> > On Tue, Jan 08, 2013 at 03:32:29PM +0800, Shaohua Li wrote:
> > > On Tue, Jan 08, 2013 at 02:38:56PM +0900, Minchan Kim wrote:
> > > > Hi Shaohua,
> > > > 
> > > > On Tue, Jan 08, 2013 at 12:26:09PM +0800, Shaohua Li wrote:
> > > > > On Tue, Jan 08, 2013 at 10:16:07AM +0800, Wanpeng Li wrote:
> > > > > > On Mon, Jan 07, 2013 at 12:06:30PM -0800, Andrew Morton wrote:
> > > > > > >On Mon, 7 Jan 2013 16:12:37 +0800
> > > > > > >Shaohua Li <shli@kernel.org> wrote:
> > > > > > >
> > > > > > >> 
> > > > > > >> Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout,
> > > > > > >> this syscall can do swapin prefetch. It has no impact if the memory isn't
> > > > > > >> swapout.
> > > > > > >
> > > > > > >Seems sensible.
> > > > > > 
> > > > > > Hi Andrew and Shaohua,
> > > > > > 
> > > > > > What's the performance in the scenario of serious memory pressure? Since
> > > > > > in this case pages in swap are highly fragmented and cache hit is most
> > > > > > impossible. If WILLNEED path should add a check to skip readahead in
> > > > > > this case since swapin only leads to unnecessary memory allocation. 
> > > > > 
> > > > > pages in swap are not highly fragmented if you access memory sequentially. In
> > > > > that case, the pages you accessed will be added to lru list side by side. So if
> > > > > app does swap prefetch, we can do sequential disk access and merge small
> > > > > request to big one.
> > > > 
> > > > How can you make sure that the range of WILLNEED was always sequentially accesssed?
> > > 
> > > you can't guarantee this even for file access.
> > 
> > Indeed.
> > 
> > > 
> > > > > Another advantage is prefetch can drive high disk iodepth.  For sequential
> > > > 
> > > > What does it mean 'iodepth'? I failed to grep it in google. :(
> > > 
> > > io depth. How many requests are inflight at a givin time.
> > 
> > Thanks for the info!
> > 
> > > 
> > > > > access, this can cause big request. Even for random access, high iodepth has
> > > > > much better performance especially for SSD.
> > > > 
> > > > So you mean WILLNEED is always good in where both random and sequential in "SSD"?
> > > > Then, how about the "Disk"?
> > > 
> > > Hmm, even for hard disk, high iodepth random access is faster than single
> > > iodepth access. Today's disk is NCQ disk. But the speedup isn't that
> > > significant like a SSD. For sequential access, both harddisk and SSD have
> > > better performance with higher iodepth.
> > > 
> > > > Wanpeng's comment makes sense to me so I guess others can have a same question
> > > > about this patch. So it would be better to write your rationale in changelog.
> > > 
> > > I would, but the question is just like why app wants to prefetch file pages. I
> > > thought it's commonsense. The problem like memory allocation exists in file
> > > prefetch too. The advantages (better IO access, CPU and disk can operate in
> > > parallel and so on) apply for both file and swap prefetch.
> > 
> > Agreed. But I have a question about semantic of madvise(DONTNEED) of anon vma.
> > If Linux start to support it for anon, user can misunderstand it following as.
> > 
> > User might think we start to use anonymous pages in that range soon so he
> > gives the hint to kernel to map all pages of the range to page table in advance.
> > (ie, pre page fault like MAP_POPULATE) and if one of the page might be
> > swapped out, readahead it. What do you think about it?
> > For clarification, it would be better to add man page description with Ccing
> > man page maintainer.
> 
> there is no confusion if the page exists or swapped. I thought what you are are
> thinking about is the page isn't populated yet. The manpage declaims WILLNEED
> "it might be a good idea to read some pages ahead." This sounds clear this
> isn't to populate memory and matches what we did. But I'm not sure what's the
> precise description.

Anyway, you are adding a new feature. For merging we need a real-world usage
scenario, the measured gain, and exact semantics for the man page. I don't
know why the current Linux man page says so little about WILLNEED; if it
doesn't give users enough information to predict the system's behaviour,
nobody will want to use it.
On the performance side, Ccing Johannes because he tried something similar a
long time ago, so he might have a comment.

> 
> Thanks,
> Shaohua
> 

-- 
Kind regards,
Minchan Kim

