* [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-17 21:09 ` Shaohua Li
0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-17 21:09 UTC (permalink / raw)
To: linux-mm-Bw31MaZKKs3YtjvyW6yDsg
Cc: danielmicay-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Andrew Morton,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski
A similar patch was posted before, but it did not get merged. I'd like
to try again and see whether there is more discussion.
http://marc.info/?l=linux-mm&m=141230769431688&w=2
mremap can be used to accelerate realloc. The problem is that mremap will
punch a hole in the original VMA, which makes some memory allocators
unable to utilize it. Jemalloc is an example. It manages memory in 4M
chunks. mremap'ing a range of a chunk will punch a hole, which other
mmap() syscalls can fill. The 4M chunk is then fragmented, which jemalloc
can't handle.
This patch adds a new flag for mremap. With it, mremap will not punch the
hole. Page tables of the original vma will be zapped in the same way, but
the vma is still there. That is, the original vma will look like a vma that
has never taken a page fault. Behavior of the new vma isn't changed.
For a private vma, accessing the original vma will cause a page fault,
just as if the address in the vma had never been accessed.
So for anonymous mappings, a new page/zero page will be faulted in. For file
mappings, a new page will be allocated and filled from the file for COW, or
the page fault will use the existing page cache.
For a shared vma, the original and new vma will map to the same file. We
could optimize this case by not zapping the original vma's page table, but
this patch doesn't do it yet.
With MREMAP_NOHOLE, the original vma still exists, so the pagefault handler
for a special vma might not be able to handle pagefaults for the mremap'd
area. The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags to do
NOHOLE mremap.
Cc: Rik van Riel <riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: Mel Gorman <mel-wPRd99KPJ+uzQB+pC5nmwQ@public.gmane.org>
Cc: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
Cc: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>
Cc: Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
include/uapi/linux/mman.h | 1 +
mm/mremap.c | 97 ++++++++++++++++++++++++++++++++---------------
2 files changed, 67 insertions(+), 31 deletions(-)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..9ee9a15 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
+#define MREMAP_NOHOLE 4
#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 38df67b..4771fd1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -234,7 +234,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr, bool *locked,
+ bool nohole)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -290,7 +291,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT) {
+ if ((vm_flags & VM_ACCOUNT) && !nohole) {
vma->vm_flags &= ~VM_ACCOUNT;
excess = vma->vm_end - vma->vm_start - old_len;
if (old_addr > vma->vm_start &&
@@ -310,11 +311,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
hiwater_vm = mm->hiwater_vm;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
- if (do_munmap(mm, old_addr, old_len) < 0) {
+ if (!nohole && do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+
+ if (nohole && (new_addr & ~PAGE_MASK)) {
+ /* caller will unaccount */
+ vma->vm_flags &= ~VM_ACCOUNT;
+ do_munmap(mm, old_addr, old_len);
+ }
+
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -332,14 +340,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
-static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long *p)
+static unsigned long validate_vma_and_charge(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long *p,
+ bool nohole)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (!vma || vma->vm_start > addr)
- goto Efault;
+ unsigned long diff;
if (is_vm_hugetlb_page(vma))
goto Einval;
@@ -348,6 +355,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
goto Efault;
+ if (nohole && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+ goto Einval;
+
/* Need to be careful about a growing mapping */
if (new_len > old_len) {
unsigned long pgoff;
@@ -360,39 +370,45 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
goto Einval;
}
+ if (nohole)
+ diff = new_len;
+ else
+ diff = new_len - old_len;
+
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
- locked += new_len - old_len;
+ locked += diff;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto Eagain;
}
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, diff >> PAGE_SHIFT))
goto Enomem;
if (vma->vm_flags & VM_ACCOUNT) {
- unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long charged = diff >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
goto Efault;
*p = charged;
}
- return vma;
+ return 0;
Efault: /* very odd choice for most of the cases, but... */
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
Einval:
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
Enomem:
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
Eagain:
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+ unsigned long new_addr, unsigned long new_len, bool *locked,
+ bool nohole)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -420,17 +436,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
goto out;
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len);
- if (ret && old_len != new_len)
- goto out;
+ if (!nohole) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ }
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, &charged);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr) {
+ ret = -EFAULT;
goto out;
}
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len, &charged,
+ nohole);
+ if (ret)
+ goto out;
map_flags = MAP_FIXED;
if (vma->vm_flags & VM_MAYSHARE)
@@ -442,7 +464,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (ret & ~PAGE_MASK)
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, nohole);
if (!(ret & ~PAGE_MASK))
goto out;
out1:
@@ -481,8 +503,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ bool nohole = flags & MREMAP_NOHOLE;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_NOHOLE))
return ret;
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +529,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ &locked, nohole);
goto out;
}
@@ -526,9 +549,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, &charged);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr) {
+ ret = -EFAULT;
goto out;
}
@@ -539,6 +562,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+ &charged, false);
+ if (ret) {
+ BUG_ON(charged != 0);
+ goto out;
+ }
if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
ret = -ENOMEM;
@@ -556,6 +585,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+ &charged, nohole);
+ if (ret)
+ goto out;
+
/*
* We weren't able to just expand or shrink the area,
* we need to create a new one and move it..
@@ -575,7 +609,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked,
+ nohole);
}
out:
if (ret & ~PAGE_MASK)
--
1.8.1
^ permalink raw reply related [flat|nested] 44+ messages in thread
* [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-17 21:09 ` Shaohua Li
0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-17 21:09 UTC (permalink / raw)
To: linux-mm
Cc: danielmicay, linux-api, Rik van Riel, Andrew Morton, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
A similar patch was posted before, but it did not get merged. I'd like
to try again and see whether there is more discussion.
http://marc.info/?l=linux-mm&m=141230769431688&w=2
mremap can be used to accelerate realloc. The problem is that mremap will
punch a hole in the original VMA, which makes some memory allocators
unable to utilize it. Jemalloc is an example. It manages memory in 4M
chunks. mremap'ing a range of a chunk will punch a hole, which other
mmap() syscalls can fill. The 4M chunk is then fragmented, which jemalloc
can't handle.
This patch adds a new flag for mremap. With it, mremap will not punch the
hole. Page tables of the original vma will be zapped in the same way, but
the vma is still there. That is, the original vma will look like a vma that
has never taken a page fault. Behavior of the new vma isn't changed.
For a private vma, accessing the original vma will cause a page fault,
just as if the address in the vma had never been accessed.
So for anonymous mappings, a new page/zero page will be faulted in. For file
mappings, a new page will be allocated and filled from the file for COW, or
the page fault will use the existing page cache.
For a shared vma, the original and new vma will map to the same file. We
could optimize this case by not zapping the original vma's page table, but
this patch doesn't do it yet.
With MREMAP_NOHOLE, the original vma still exists, so the pagefault handler
for a special vma might not be able to handle pagefaults for the mremap'd
area. The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags to do
NOHOLE mremap.
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Shaohua Li <shli@fb.com>
---
include/uapi/linux/mman.h | 1 +
mm/mremap.c | 97 ++++++++++++++++++++++++++++++++---------------
2 files changed, 67 insertions(+), 31 deletions(-)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..9ee9a15 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
+#define MREMAP_NOHOLE 4
#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 38df67b..4771fd1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -234,7 +234,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr, bool *locked,
+ bool nohole)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -290,7 +291,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT) {
+ if ((vm_flags & VM_ACCOUNT) && !nohole) {
vma->vm_flags &= ~VM_ACCOUNT;
excess = vma->vm_end - vma->vm_start - old_len;
if (old_addr > vma->vm_start &&
@@ -310,11 +311,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
hiwater_vm = mm->hiwater_vm;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
- if (do_munmap(mm, old_addr, old_len) < 0) {
+ if (!nohole && do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+
+ if (nohole && (new_addr & ~PAGE_MASK)) {
+ /* caller will unaccount */
+ vma->vm_flags &= ~VM_ACCOUNT;
+ do_munmap(mm, old_addr, old_len);
+ }
+
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -332,14 +340,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
-static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long *p)
+static unsigned long validate_vma_and_charge(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long *p,
+ bool nohole)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (!vma || vma->vm_start > addr)
- goto Efault;
+ unsigned long diff;
if (is_vm_hugetlb_page(vma))
goto Einval;
@@ -348,6 +355,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
goto Efault;
+ if (nohole && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+ goto Einval;
+
/* Need to be careful about a growing mapping */
if (new_len > old_len) {
unsigned long pgoff;
@@ -360,39 +370,45 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
goto Einval;
}
+ if (nohole)
+ diff = new_len;
+ else
+ diff = new_len - old_len;
+
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
- locked += new_len - old_len;
+ locked += diff;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto Eagain;
}
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, diff >> PAGE_SHIFT))
goto Enomem;
if (vma->vm_flags & VM_ACCOUNT) {
- unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long charged = diff >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
goto Efault;
*p = charged;
}
- return vma;
+ return 0;
Efault: /* very odd choice for most of the cases, but... */
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
Einval:
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
Enomem:
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
Eagain:
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+ unsigned long new_addr, unsigned long new_len, bool *locked,
+ bool nohole)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -420,17 +436,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
goto out;
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len);
- if (ret && old_len != new_len)
- goto out;
+ if (!nohole) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ }
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, &charged);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr) {
+ ret = -EFAULT;
goto out;
}
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len, &charged,
+ nohole);
+ if (ret)
+ goto out;
map_flags = MAP_FIXED;
if (vma->vm_flags & VM_MAYSHARE)
@@ -442,7 +464,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (ret & ~PAGE_MASK)
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, nohole);
if (!(ret & ~PAGE_MASK))
goto out;
out1:
@@ -481,8 +503,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ bool nohole = flags & MREMAP_NOHOLE;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_NOHOLE))
return ret;
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +529,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ &locked, nohole);
goto out;
}
@@ -526,9 +549,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, &charged);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr) {
+ ret = -EFAULT;
goto out;
}
@@ -539,6 +562,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+ &charged, false);
+ if (ret) {
+ BUG_ON(charged != 0);
+ goto out;
+ }
if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
ret = -ENOMEM;
@@ -556,6 +585,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
+ ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+ &charged, nohole);
+ if (ret)
+ goto out;
+
/*
* We weren't able to just expand or shrink the area,
* we need to create a new one and move it..
@@ -575,7 +609,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked,
+ nohole);
}
out:
if (ret & ~PAGE_MASK)
--
1.8.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 44+ messages in thread
[parent not found: <deaa4139de6e6422a0cec1e3282553aed3495e94.1426626497.git.shli-b10kYP2dOMg@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-17 21:09 ` Shaohua Li
@ 2015-03-18 22:31 ` Andrew Morton
-1 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-18 22:31 UTC (permalink / raw)
To: Shaohua Li
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
danielmicay-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
> There was a similar patch posted before, but it doesn't get merged. I'd like
> to try again if there are more discussions.
> http://marc.info/?l=linux-mm&m=141230769431688&w=2
>
> mremap can be used to accelerate realloc. The problem is mremap will
> punch a hole in original VMA, which makes specific memory allocator
> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> chunks. mremap a range of the chunk will punch a hole, which other
> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> can't handle it.
Daniel's changelog had additional details regarding the userspace
allocators' behaviour. It would be best to incorporate that into your
changelog.
Daniel also had microbenchmark testing results for glibc and jemalloc.
Can you please do this?
I'm not seeing any testing results for tcmalloc and I'm not seeing
confirmation that this patch will be useful for tcmalloc. Has anyone
tried it, or sought input from tcmalloc developers?
> This patch adds a new flag for mremap. With it, mremap will not punch the
> hole. page tables of original vma will be zapped in the same way, but
> vma is still there. That is original vma will look like a vma without
> pagefault. Behavior of new vma isn't changed.
>
> For private vma, accessing original vma will cause
> page fault and just like the address of the vma has never been accessed.
> So for anonymous, new page/zero page will be fault in. For file mapping,
> new page will be allocated with file reading for cow, or pagefault will
> use existing page cache.
>
> For shared vma, original and new vma will map to the same file. We can
> optimize this without zaping original vma's page table in this case, but
> this patch doesn't do it yet.
>
> Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> for special vma might not able to handle pagefault for mremap'd area.
> The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> mremap.
At some point (preferably an early point) we'd like a manpage update
and a cc: to linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org please.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-18 22:31 ` Andrew Morton
0 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-18 22:31 UTC (permalink / raw)
To: Shaohua Li
Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> There was a similar patch posted before, but it doesn't get merged. I'd like
> to try again if there are more discussions.
> http://marc.info/?l=linux-mm&m=141230769431688&w=2
>
> mremap can be used to accelerate realloc. The problem is mremap will
> punch a hole in original VMA, which makes specific memory allocator
> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> chunks. mremap a range of the chunk will punch a hole, which other
> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> can't handle it.
Daniel's changelog had additional details regarding the userspace
allocators' behaviour. It would be best to incorporate that into your
changelog.
Daniel also had microbenchmark testing results for glibc and jemalloc.
Can you please do this?
I'm not seeing any testing results for tcmalloc and I'm not seeing
confirmation that this patch will be useful for tcmalloc. Has anyone
tried it, or sought input from tcmalloc developers?
> This patch adds a new flag for mremap. With it, mremap will not punch the
> hole. page tables of original vma will be zapped in the same way, but
> vma is still there. That is original vma will look like a vma without
> pagefault. Behavior of new vma isn't changed.
>
> For private vma, accessing original vma will cause
> page fault and just like the address of the vma has never been accessed.
> So for anonymous, new page/zero page will be fault in. For file mapping,
> new page will be allocated with file reading for cow, or pagefault will
> use existing page cache.
>
> For shared vma, original and new vma will map to the same file. We can
> optimize this without zaping original vma's page table in this case, but
> this patch doesn't do it yet.
>
> Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> for special vma might not able to handle pagefault for mremap'd area.
> The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> mremap.
At some point (preferably an early point) we'd like a manpage update
and a cc: to linux-man@vger.kernel.org please.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <20150318153100.5658b741277f3717b52e42d9-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-18 22:31 ` Andrew Morton
@ 2015-03-19 5:08 ` Shaohua Li
-1 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19 5:08 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
danielmicay-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, Mar 18, 2015 at 03:31:00PM -0700, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
>
> > There was a similar patch posted before, but it doesn't get merged. I'd like
> > to try again if there are more discussions.
> > http://marc.info/?l=linux-mm&m=141230769431688&w=2
> >
> > mremap can be used to accelerate realloc. The problem is mremap will
> > punch a hole in original VMA, which makes specific memory allocator
> > unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > chunks. mremap a range of the chunk will punch a hole, which other
> > mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > can't handle it.
>
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour. It would be best to incorporate that into your
> changelog.
I'll extract some from his changelog in next post
> Daniel also had microbenchmark testing results for glibc and jemalloc.
> Can you please do this?
I ran Daniel's microbenchmark too, and not surprisingly the result is
similar:
glibc: 32.82
jemalloc: 70.35
jemalloc+mremap: 33.01
tcmalloc: 68.81
but tcmalloc doesn't support mremap currently, so I can't test it.
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc. Has anyone
> tried it, or sought input from tcmalloc developers?
>
> > This patch adds a new flag for mremap. With it, mremap will not punch the
> > hole. page tables of original vma will be zapped in the same way, but
> > vma is still there. That is original vma will look like a vma without
> > pagefault. Behavior of new vma isn't changed.
> >
> > For private vma, accessing original vma will cause
> > page fault and just like the address of the vma has never been accessed.
> > So for anonymous, new page/zero page will be fault in. For file mapping,
> > new page will be allocated with file reading for cow, or pagefault will
> > use existing page cache.
> >
> > For shared vma, original and new vma will map to the same file. We can
> > optimize this without zaping original vma's page table in this case, but
> > this patch doesn't do it yet.
> >
> > Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> > for special vma might not able to handle pagefault for mremap'd area.
> > The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> > mremap.
>
> At some point (preferably an early point) we'd like a manpage update
> and a cc: to linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org please.
ok, will add in next post.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19 5:08 ` Shaohua Li
0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19 5:08 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, Mar 18, 2015 at 03:31:00PM -0700, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
>
> > There was a similar patch posted before, but it doesn't get merged. I'd like
> > to try again if there are more discussions.
> > http://marc.info/?l=linux-mm&m=141230769431688&w=2
> >
> > mremap can be used to accelerate realloc. The problem is mremap will
> > punch a hole in original VMA, which makes specific memory allocator
> > unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > chunks. mremap a range of the chunk will punch a hole, which other
> > mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > can't handle it.
>
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour. It would be best to incorporate that into your
> changelog.
I'll extract some from his changelog in next post
> Daniel also had microbenchmark testing results for glibc and jemalloc.
> Can you please do this?
I ran Daniel's microbenchmark too, and not surprisingly the result is
similar:
glibc: 32.82
jemalloc: 70.35
jemalloc+mremap: 33.01
tcmalloc: 68.81
but tcmalloc doesn't support mremap currently, so I can't test it.
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc. Has anyone
> tried it, or sought input from tcmalloc developers?
>
> > This patch adds a new flag for mremap. With it, mremap will not punch the
> > hole. page tables of original vma will be zapped in the same way, but
> > vma is still there. That is original vma will look like a vma without
> > pagefault. Behavior of new vma isn't changed.
> >
> > For private vma, accessing original vma will cause
> > page fault and just like the address of the vma has never been accessed.
> > So for anonymous, new page/zero page will be fault in. For file mapping,
> > new page will be allocated with file reading for cow, or pagefault will
> > use existing page cache.
> >
> > For shared vma, original and new vma will map to the same file. We can
> > optimize this without zaping original vma's page table in this case, but
> > this patch doesn't do it yet.
> >
> > Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> > for special vma might not able to handle pagefault for mremap'd area.
> > The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> > mremap.
>
> At some point (preferably an early point) we'd like a manpage update
> and a cc: to linux-man@vger.kernel.org please.
ok, will add in next post.
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <20150319050826.GA1591708-XA4dbxeItU7BTsLV8vAZyg2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-19 5:08 ` Shaohua Li
@ 2015-03-19 5:22 ` Andrew Morton
-1 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-19 5:22 UTC (permalink / raw)
To: Shaohua Li
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
danielmicay-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
> > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > Can you please do this?
>
> I run Daniel's microbenchmark too, and not surprise the result is
> similar:
> glibc: 32.82
> jemalloc: 70.35
> jemalloc+mremap: 33.01
> tcmalloc: 68.81
>
> but tcmalloc doesn't support mremap currently, so I cant test it.
But Daniel's changelog implies strongly that tcmalloc would benefit
from his patch. Was that inaccurate or is this a difference between
his patch and yours?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19 5:22 ` Andrew Morton
0 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-19 5:22 UTC (permalink / raw)
To: Shaohua Li
Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli@fb.com> wrote:
> > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > Can you please do this?
>
> I run Daniel's microbenchmark too, and not surprise the result is
> similar:
> glibc: 32.82
> jemalloc: 70.35
> jemalloc+mremap: 33.01
> tcmalloc: 68.81
>
> but tcmalloc doesn't support mremap currently, so I cant test it.
But Daniel's changelog implies strongly that tcmalloc would benefit
from his patch. Was that inaccurate or is this a difference between
his patch and yours?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <20150318222246.bc608dd0.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-19 5:22 ` Andrew Morton
@ 2015-03-19 16:38 ` Shaohua Li
-1 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19 16:38 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
danielmicay-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, Mar 18, 2015 at 10:22:46PM -0700, Andrew Morton wrote:
> On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
>
> > > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > > Can you please do this?
> >
> > I run Daniel's microbenchmark too, and not surprise the result is
> > similar:
> > glibc: 32.82
> > jemalloc: 70.35
> > jemalloc+mremap: 33.01
> > tcmalloc: 68.81
> >
> > but tcmalloc doesn't support mremap currently, so I cant test it.
>
> But Daniel's changelog implies strongly that tcmalloc would benefit
> from his patch. Was that inaccurate or is this a difference between
> his patch and yours?
There is no big difference, except I fixed some issues. Daniel didn't
post data for tcmalloc; I suppose mremap could potentially make
tcmalloc faster too, but Daniel can clarify.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19 16:38 ` Shaohua Li
0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19 16:38 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski
On Wed, Mar 18, 2015 at 10:22:46PM -0700, Andrew Morton wrote:
> On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli@fb.com> wrote:
>
> > > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > > Can you please do this?
> >
> > I run Daniel's microbenchmark too, and not surprise the result is
> > similar:
> > glibc: 32.82
> > jemalloc: 70.35
> > jemalloc+mremap: 33.01
> > tcmalloc: 68.81
> >
> > but tcmalloc doesn't support mremap currently, so I cant test it.
>
> But Daniel's changelog implies strongly that tcmalloc would benefit
> from his patch. Was that inaccurate or is this a difference between
> his patch and yours?
There is no big difference, except I fixed some issues. Daniel didn't
post data for tcmalloc; I suppose mremap could potentially make
tcmalloc faster too, but Daniel can clarify.
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-18 22:31 ` Andrew Morton
@ 2015-03-19 5:34 ` Daniel Micay
-1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-19 5:34 UTC (permalink / raw)
To: Andrew Morton, Shaohua Li
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
Rik van Riel, Hugh Dickins, Mel Gorman, Johannes Weiner,
Michal Hocko, Andy Lutomirski, Aliaksey Kandratsenka
[-- Attachment #1: Type: text/plain, Size: 2780 bytes --]
On 18/03/15 06:31 PM, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
>
>> There was a similar patch posted before, but it doesn't get merged. I'd like
>> to try again if there are more discussions.
>> http://marc.info/?l=linux-mm&m=141230769431688&w=2
>>
>> mremap can be used to accelerate realloc. The problem is mremap will
>> punch a hole in original VMA, which makes specific memory allocator
>> unable to utilize it. Jemalloc is an example. It manages memory in 4M
>> chunks. mremap a range of the chunk will punch a hole, which other
>> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
>> can't handle it.
>
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour. It would be best to incorporate that into your
> changelog.
>
> Daniel also had microbenchmark testing results for glibc and jemalloc.
> Can you please do this?
>
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc. Has anyone
> tried it, or sought input from tcmalloc developers?
TCMalloc and jemalloc are currently equally slow in this benchmark, as
neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
the currently most active TCMalloc developer so they can give input
into whether this patch would let them use it.
#include <string.h>
#include <stdlib.h>
/*
 * realloc() growth microbenchmark.
 *
 * Doubles a single allocation from 4 MiB up to 512 MiB and, after every
 * step, writes 0xff over the bytes that the step added.
 *
 * Returns 0 on success, 1 if realloc() fails.
 */
int main(void) {
    void *ptr = NULL;
    size_t old_size = 0;
    for (size_t size = 4 * 1024 * 1024; size < 1024 * 1024 * 1024; size *= 2) {
        /* Keep the old pointer until realloc() is known to have succeeded. */
        void *grown = realloc(ptr, size);
        if (!grown) {
            free(ptr);
            return 1;
        }
        ptr = grown;
        /*
         * Touch only the newly added tail [old_size, size). Writing from
         * ptr itself would re-dirty bytes that were already written and
         * leave the new region untouched.
         */
        memset((char *)ptr + old_size, 0xff, size - old_size);
        old_size = size;
    }
    free(ptr);
    return 0;
}
If an outer loop is wrapped around this, jemalloc's master branch will
at least be able to do in-place resizing for everything after the 1st
run, but that's much rarer in the real world where there are many users
of the allocator. The lack of mremap still ends up hurting a lot.
FWIW, jemalloc is now the default allocator on Android so there are an
increasing number of Linux machines unable to leverage mremap. It could
be worked around by attempting to use an mmap hint to get the memory
back, but that can fail as it's a race with the other threads and that
leads to increased fragmentation over the long term.
It's especially problematic if a large range of virtual memory is
reserved and divided up between per-CPU arenas for concurrency, but
only garbage collectors tend to do stuff like this at the moment. This
can still be dealt with by checking internal uses of mmap and returning
any memory from the reserved range to the right place, but it shouldn't
have to be that ugly.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19 5:34 ` Daniel Micay
0 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-19 5:34 UTC (permalink / raw)
To: Andrew Morton, Shaohua Li
Cc: linux-mm, linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
Johannes Weiner, Michal Hocko, Andy Lutomirski,
Aliaksey Kandratsenka
[-- Attachment #1: Type: text/plain, Size: 2758 bytes --]
On 18/03/15 06:31 PM, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
>
>> There was a similar patch posted before, but it doesn't get merged. I'd like
>> to try again if there are more discussions.
>> http://marc.info/?l=linux-mm&m=141230769431688&w=2
>>
>> mremap can be used to accelerate realloc. The problem is mremap will
>> punch a hole in original VMA, which makes specific memory allocator
>> unable to utilize it. Jemalloc is an example. It manages memory in 4M
>> chunks. mremap a range of the chunk will punch a hole, which other
>> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
>> can't handle it.
>
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour. It would be best to incorporate that into your
> changelog.
>
> Daniel also had microbenchmark testing results for glibc and jemalloc.
> Can you please do this?
>
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc. Has anyone
> tried it, or sought input from tcmalloc developers?
TCMalloc and jemalloc are currently equally slow in this benchmark, as
neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
the currently most active TCMalloc developer so they can give input
into whether this patch would let them use it.
#include <string.h>
#include <stdlib.h>
/*
 * realloc() growth microbenchmark.
 *
 * Doubles a single allocation from 4 MiB up to 512 MiB and, after every
 * step, writes 0xff over the bytes that the step added.
 *
 * Returns 0 on success, 1 if realloc() fails.
 */
int main(void) {
    void *ptr = NULL;
    size_t old_size = 0;
    for (size_t size = 4 * 1024 * 1024; size < 1024 * 1024 * 1024; size *= 2) {
        /* Keep the old pointer until realloc() is known to have succeeded. */
        void *grown = realloc(ptr, size);
        if (!grown) {
            free(ptr);
            return 1;
        }
        ptr = grown;
        /*
         * Touch only the newly added tail [old_size, size). Writing from
         * ptr itself would re-dirty bytes that were already written and
         * leave the new region untouched.
         */
        memset((char *)ptr + old_size, 0xff, size - old_size);
        old_size = size;
    }
    free(ptr);
    return 0;
}
If an outer loop is wrapped around this, jemalloc's master branch will
at least be able to do in-place resizing for everything after the 1st
run, but that's much rarer in the real world where there are many users
of the allocator. The lack of mremap still ends up hurting a lot.
FWIW, jemalloc is now the default allocator on Android so there are an
increasing number of Linux machines unable to leverage mremap. It could
be worked around by attempting to use an mmap hint to get the memory
back, but that can fail as it's a race with the other threads and that
leads to increased fragmentation over the long term.
It's especially problematic if a large range of virtual memory is
reserved and divided up between per-CPU arenas for concurrency, but
only garbage collectors tend to do stuff like this at the moment. This
can still be dealt with by checking internal uses of mmap and returning
any memory from the reserved range to the right place, but it shouldn't
have to be that ugly.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-19 5:34 ` Daniel Micay
(?)
@ 2015-03-22 6:06 ` Aliaksey Kandratsenka
[not found] ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-03-23 5:17 ` Shaohua Li
-1 siblings, 2 replies; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-22 6:06 UTC (permalink / raw)
To: Daniel Micay
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 9888 bytes --]
On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com>
wrote:
>
> On 18/03/15 06:31 PM, Andrew Morton wrote:
> > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> >
> >> There was a similar patch posted before, but it doesn't get merged.
I'd like
> >> to try again if there are more discussions.
> >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> >>
> >> mremap can be used to accelerate realloc. The problem is mremap will
> >> punch a hole in original VMA, which makes specific memory allocator
> >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> >> chunks. mremap a range of the chunk will punch a hole, which other
> >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> >> can't handle it.
> >
> > Daniel's changelog had additional details regarding the userspace
> > allocators' behaviour. It would be best to incorporate that into your
> > changelog.
> >
> > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > Can you please do this?
> >
> > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > confirmation that this patch will be useful for tcmalloc. Has anyone
> > tried it, or sought input from tcmalloc developers?
>
> TCMalloc and jemalloc are currently equally slow in this benchmark, as
> neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> the currently most active TCMalloc developer so they can give input
> into whether this patch would let them use it.
Hi.
Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).
Yes, that might be a useful feature. (Assuming I understood it correctly) I
believe tcmalloc would likely use:
mremap(old_ptr, move_size, move_size,
MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
new_ptr);
as optimized equivalent of:
memcpy(new_ptr, old_ptr, move_size);
madvise(old_ptr, move_size, MADV_DONTNEED);
And btw I find the MREMAP_RETAIN name from the original patch to be slightly
more intuitive than MREMAP_NOHOLE. In my humble opinion the latter name does
not reflect the semantics of this feature at all (assuming of course I
correctly understood what the patch does).
I do have a couple of questions about this approach however. Please feel
free to
educate me on them.
a) what is the smallest size where mremap is going to be faster ?
My initial thinking was that we'd likely use mremap in all cases where we
know
that touching destination would cause minor page faults (i.e. when
destination
chunk was MADV_DONTNEED-ed or is brand new mapping). And then also always
when
size is large enough, i.e. because "teleporting" large count of pages is
likely
to be faster than copying them.
But now I realize that it is more interesting than that. I.e. because as
Daniel
pointed out, mremap holds mmap_sem exclusively, while page faults are
holding it
for read. That could be optimized of course. Either by separate "teleport
ptes"
syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
write
and retaking it for read for "moving pages" part of work. Being not really
familiar with kernel code I have no idea if that's doable or not. But it
looks
like it might be quite important.
Another aspect where I am similarly illiterate is performance effect of tlb
flushes needed for such operation.
We can certainly experiment and find that limit. But if mremap threshold is
going to be large, then perhaps this kernel feature is not as useful as we
may
hope.
b) is that optimization worth having at all ?
After all, memcpy is actually known to be fast. I understand that copying
memory
in user space can be slowed down by minor page faults (results below seem to
confirm that). But this is something where either allocator may retain
populated
pages a bit longer or where kernel could help. E.g. maybe by exposing
something
similar to MAP_POPULATE in madvise, or even doing some safe combination of
madvise and MAP_UNINITIALIZED.
I've played with Daniel's original benchmark (copied from
http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
modifications:
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
/*
 * realloc() growth benchmark.
 *
 * Runs 64 passes. Each pass doubles a single allocation from 4 bytes up to
 * 512 MiB and writes 0xff over the bytes added by each step.
 *
 * If the first argument is "--mlock", mlockall(MCL_CURRENT | MCL_FUTURE) is
 * called before the passes start.
 *
 * Aborts (after perror) if mlockall() or realloc() fails; returns 0 otherwise.
 */
int main(int argc, char **argv)
{
    if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
        int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
        if (rv) {
            perror("mlockall");
            abort();
        }
        puts("mlocked!");
    }

    for (size_t i = 0; i < 64; i++) {
        void *ptr = NULL;
        size_t old_size = 0;

        for (size_t size = 4; size < (1 << 30); size *= 2) {
            /*
             * Disabled experiment: allocate a 1 MiB "hole" before the
             * realloc and free it afterwards.
             *
             * void *hole = malloc(1 << 20);
             * if (!hole) {
             *     perror("malloc");
             *     abort();
             * }
             */
            ptr = realloc(ptr, size);
            if (!ptr) {
                perror("realloc");
                abort();
            }
            /* free(hole); */

            /*
             * Touch only the newly added tail. The char * cast is needed
             * because arithmetic on void * is a GNU extension, not ISO C.
             */
            memset((char *)ptr + old_size, 0xff, size - old_size);
            old_size = size;
        }
        free(ptr);
    }
    return 0;
}
I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
important applications or not. It can be argued that apps that care about
such
large vectors can do mremap themselves.
On the other hand, I believe that this micro benchmark could be plausibly
changed to grow the vector by a smaller factor (i.e. see
https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
And with a smaller growth factor, it seems reasonable to expect larger
overhead from memcpy and smaller overhead from mremap. And thus favor mremap
more.
And I confirm that with all default settings tcmalloc and jemalloc lose to
glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
AFAIK) actually matches or exceeds glibc speed, despite still not doing
mremap. Apparently it is smarter about avoiding moving allocation for those
realloc-s. And it was even able to resist my attempt to force it to move
allocation. I haven't investigated why. Note that I built it couple weeks
or so
ago from dev branch, so it might simply have bugs.
Results also vary greatly depending on the transparent huge pages setting.
Here's what I've got:
allocator | mode      | time  | sys time | pgfaults | extra
----------+-----------+-------+----------+----------+-------------------------------
glibc     |           | 10.75 |     8.44 |  8388770 |
glibc     | thp       |  5.67 |     3.44 |   310882 |
glibc     | mlock     | 13.22 |     9.41 |  8388821 |
glibc     | thp+mlock |  8.43 |     4.63 |   310933 |
tcmalloc  |           | 11.46 |     2.00 |  2104826 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  | thp       | 10.61 |     0.89 |   386206 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  | mlock     | 10.11 |     0.27 |   264721 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  | thp+mlock | 10.28 |     0.17 |    46011 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |           | 23.63 |    17.16 | 16770107 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  | thp       | 11.82 |     5.14 |   352477 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  | mlock     | 10.10 |     0.28 |   264724 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  | thp+mlock | 10.30 |     0.17 |    49168 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
jemalloc1 |           | 23.71 |    17.33 | 16744572 |
jemalloc1 | thp       | 11.65 |     4.68 |    64988 |
jemalloc1 | mlock     | 10.13 |     0.29 |   263305 |
jemalloc1 | thp+mlock | 10.05 |     0.17 |    50217 |
jemalloc2 |           | 10.87 |     8.64 |  8521796 |
jemalloc2 | thp       |  4.64 |     2.32 |    56060 |
jemalloc2 | mlock     |  4.22 |     0.28 |   263181 |
jemalloc2 | thp+mlock |  4.12 |     0.19 |    50411 |
----------+-----------+-------+----------+----------+-------------------------------
NOTE: the usual disclaimer applies about the possibility of screwing something
up and getting invalid benchmark results without being able to see it. I
apologize in advance.
NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is
home-built snapshot of upcoming jemalloc 4.0.
NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (the default since 2.4) makes tcmalloc
MADV_DONTNEED large free blocks immediately, as opposed to less often with
the setting of "false". It makes a big difference to page fault counts and
thus to runtime.
Another notable thing is how mlock effectively disables MADV_DONTNEED for
jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
slightly better on runtime than glibc. The latter spends a ton of time in the
kernel, probably handling minor page faults, and the former burns cpu in user
space doing memcpy-s. So "tons of memcpys" seems to be competitive with what
glibc is doing in this benchmark.
THP changes things however. Where apparently minor page faults become a lot
cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock case.
So
in THP case, cost of page faults is smaller than cost of large memcpy.
So results are somewhat mixed, but overall I'm not sure that I'm able to see
a very convincing story for MREMAP_NOHOLE yet. However:
1) it is possible that I am missing something. If so, please, educate me.
2) if kernel implements this API, I'm going to use it in tcmalloc.
P.S. benchmark results also seem to indicate that tcmalloc could do something
to explicitly enable THP and maybe better adapt to its presence. Perhaps with
some collaboration with kernel, i.e. to prevent that famous delay-ful-ness
which causes people to disable THP.
[-- Attachment #2: Type: text/html, Size: 11931 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-22 6:06 ` Aliaksey Kandratsenka
@ 2015-03-22 7:22 ` Daniel Micay
2015-03-23 5:17 ` Shaohua Li
1 sibling, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-22 7:22 UTC (permalink / raw)
To: Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
google-perftools-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org
[-- Attachment #1: Type: text/plain, Size: 7020 bytes --]
> Yes, that might be useful feature. (Assuming I understood it correctly)
> I believe
> tcmalloc would likely use:
>
> mremap(old_ptr, move_size, move_size,
> MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
> new_ptr);
>
> as optimized equivalent of:
>
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);
Yeah, it's essentially an optimized memcpy for when you don't need the
source allocation anymore.
> a) what is the smallest size where mremap is going to be faster ?
There are probably a lot of variables here like the CPU design and the
speed of system calls (syscall auditing makes them much slower!) in
addition to the stuff you've pointed out.
> My initial thinking was that we'd likely use mremap in all cases where
> we know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
> always when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
>
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate
> "teleport ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
> for write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.
I think it's doable but it would pessimize the case where the dest VMA
isn't reusable. It would need to optimistically take the reader lock to
find out and then drop it. However, userspace knows when this is surely
going to work and could give it a hint.
I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
case would be. It would be extremely specific though... they want the
kernel to move pages from a source VMA to a destination VMA where both
are anon/private with identical flags so only the reader lock is
necessary. On top of that, they really want to keep around as many
destination pages as possible, maybe by swapping as many as possible
back to the source.
That's *extremely* specific though and I now think the best way to get
there is by landing this feature and then extending it as necessary down
the road. An allocator may actually want to manage other kinds of
mappings itself and it would want the mmap_sem optimization to be an
optional hint.
> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple
> weeks or so
> ago from dev branch, so it might simply have bugs.
I submitted patches teaching jemalloc to expand/shrink huge allocations
in-place, so it's hitting the in-place resize path after the initial
iteration on a repeated reallocation benchmark that's not doing any
other allocations.
In jemalloc, everything is allocated via naturally aligned chunks (4M
before, recently down to 256k in master) so if you want to block
in-place huge reallocation you'll either need to force a new non-huge
chunk to be allocated or make one that's at least as large as the chunk
size.
I don't think in-place reallocation is very common in long-running
programs. It's probably more common now that jemalloc is experimenting
with first-fit for chunk/huge allocation rather than address-ordered
best-fit. The best-fit algorithm is designed to keep the opportunity for
in-place reallocation to a minimum, although address ordering does
counter it :).
> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts
> and thus
> on runtime.
>
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what
> glibc is
> doing in this benchmark.
When I taught jemalloc to use the MREMAP_RETAIN flag it was getting
significant wins over glibc, so this might be caused by the time spent
managing metadata, etc.
> THP changes things however. Where apparently minor page faults become a lot
> cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock
> case. So
> in THP case, cost of page faults is smaller than cost of large memcpy.
>
> So results are somewhat mixed, but overall I'm not sure that I'm able to see
> very convincing story for MREMAP_HOLE yet. However:
>
> 1) it is possible that I am missing something. If so, please, educate me.
>
> 2) if kernel implements this API, I'm going to use it in tcmalloc.
>
> P.S. benchmark results also seem to indicate that tcmalloc could do
> something to
> explicitly enable THP and maybe better adapt to it's presence. Perhaps
> with some
> collaboration with kernel, i.e. to prevent that famous delay-ful-ness which
> causes people to disable THP.
BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
madvise purging. The part where khugepaged assigns huge pages to dense
spans of pages is *great*. The part where the kernel hands out a huge
page for a fault in a 2M span can be awful. It causes the model
inside the allocator of uncommitted vs. committed pages to break down.
For example, the allocator might use 1M of a huge page and then start
purging. The purging will split it into 4k pages, so there will be 1M of
zeroed 4k pages that are considered purged by the allocator. Over time,
this can cripple purging. Search for "jemalloc huge pages" and you'll
find lots of horror stories about this.
I think a THP implementation that played well with purging would
need to drop the page fault heuristic and rely on a significantly better
khugepaged. This would mean faulting in a span of memory would no longer
be faster. Having a flag to populate a range with madvise would help a
lot though, since the allocator knows exactly how much it's going to
clobber with the memcpy. There will still be a threshold where mremap
gets significantly faster, but it would move it higher.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-22 7:22 ` Daniel Micay
0 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-22 7:22 UTC (permalink / raw)
To: Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 7020 bytes --]
> Yes, that might be useful feature. (Assuming I understood it correctly)
> I believe
> tcmalloc would likely use:
>
> mremap(old_ptr, move_size, move_size,
> MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
> new_ptr);
>
> as optimized equivalent of:
>
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);
Yeah, it's essentially an optimized memcpy for when you don't need the
source allocation anymore.
> a) what is the smallest size where mremap is going to be faster ?
There are probably a lot of variables here like the CPU design and the
speed of system calls (syscall auditing makes them much slower!) in
addition to the stuff you've pointed out.
> My initial thinking was that we'd likely use mremap in all cases where
> we know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
> always when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
>
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate
> "teleport ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
> for write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.
I think it's doable but it would pessimize the case where the dest VMA
isn't reusable. It would need to optimistically take the reader lock to
find out and then drop it. However, userspace knows when this is surely
going to work and could give it a hint.
I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
case would be. It would be extremely specific though... they want the
kernel to move pages from a source VMA to a destination VMA where both
are anon/private with identical flags so only the reader lock is
necessary. On top of that, they really want to keep around as many
destination pages as possible, maybe by swapping as many as possible
back to the source.
That's *extremely* specific though and I now think the best way to get
there is by landing this feature and then extending it as necessary down
the road. An allocator may actually want to manage other kinds of
mappings itself and it would want the mmap_sem optimization to be an
optional hint.
> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple
> weeks or so
> ago from dev branch, so it might simply have bugs.
I submitted patches teaching jemalloc to expand/shrink huge allocations
in-place, so it's hitting the in-place resize path after the initial
iteration on a repeated reallocation benchmark that's not doing any
other allocations.
In jemalloc, everything is allocated via naturally aligned chunks (4M
before, recently down to 256k in master) so if you want to block
in-place huge reallocation you'll either need to force a new non-huge
chunk to be allocated or make one that's at least as large as the chunk
size.
I don't think in-place reallocation is very common in long-running
programs. It's probably more common now that jemalloc is experimenting
with first-fit for chunk/huge allocation rather than address-ordered
best-fit. The best-fit algorithm is designed to keep the opportunity for
in-place reallocation to a minimum, although address ordering does
counter it :).
> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts
> and thus
> on runtime.
>
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what
> glibc is
> doing in this benchmark.
When I taught jemalloc to use the MREMAP_RETAIN flag it was getting
significant wins over glibc, so this might be caused by the time spent
managing metadata, etc.
> THP changes things however. Where apparently minor page faults become a lot
> cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock
> case. So
> in THP case, cost of page faults is smaller than cost of large memcpy.
>
> So results are somewhat mixed, but overall I'm not sure that I'm able to see
> very convincing story for MREMAP_HOLE yet. However:
>
> 1) it is possible that I am missing something. If so, please, educate me.
>
> 2) if kernel implements this API, I'm going to use it in tcmalloc.
>
> P.S. benchmark results also seem to indicate that tcmalloc could do
> something to
> explicitly enable THP and maybe better adapt to it's presence. Perhaps
> with some
> collaboration with kernel, i.e. to prevent that famous delay-ful-ness which
> causes people to disable THP.
BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
madvise purging. The part where khugepaged assigns huge pages to dense
spans of pages is *great*. The part where the kernel hands out a huge
page for a fault in a 2M span can be awful. It causes the model
inside the allocator of uncommitted vs. committed pages to break down.
For example, the allocator might use 1M of a huge page and then start
purging. The purging will split it into 4k pages, so there will be 1M of
zeroed 4k pages that are considered purged by the allocator. Over time,
this can cripple purging. Search for "jemalloc huge pages" and you'll
find lots of horror stories about this.
I think a THP implementation that played well with purging would
need to drop the page fault heuristic and rely on a significantly better
khugepaged. This would mean faulting in a span of memory would no longer
be faster. Having a flag to populate a range with madvise would help a
lot though, since the allocator knows exactly how much it's going to
clobber with the memcpy. There will still be a threshold where mremap
gets significantly faster, but it would move it higher.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <550E6D9D.1060507-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-22 7:22 ` Daniel Micay
@ 2015-03-24 4:36 ` Aliaksey Kandratsenka
-1 siblings, 0 replies; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-24 4:36 UTC (permalink / raw)
To: Daniel Micay
Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
google-perftools-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org
Hi.
First of all, I'd like to apologize for messing up formatting of my
past email. I've learned my lesson.
On Sun, Mar 22, 2015 at 12:22 AM, Daniel Micay <danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> My initial thinking was that we'd likely use mremap in all cases where
>> we know
>> that touching destination would cause minor page faults (i.e. when
>> destination
>> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
>> always when
>> size is large enough, i.e. because "teleporting" large count of pages is
>> likely
>> to be faster than copying them.
>>
>> But now I realize that it is more interesting than that. I.e. because as
>> Daniel
>> pointed out, mremap holds mmap_sem exclusively, while page faults are
>> holding it
>> for read. That could be optimized of course. Either by separate
>> "teleport ptes"
>> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
>> for write
>> and retaking it for read for "moving pages" part of work. Being not really
>> familiar with kernel code I have no idea if that's doable or not. But it
>> looks
>> like it might be quite important.
>
> I think it's doable but it would pessimize the case where the dest VMA
> isn't reusable. It would need to optimistically take the reader lock to
> find out and then drop it. However, userspace knows when this is surely
> going to work and could give it a hint.
>
> I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
> case would be. It would be extremely specific though... they want the
> kernel to move pages from a source VMA to a destination VMA where both
> are anon/private with identical flags so only the reader lock is
> necessary. On top of that, they really want to keep around as many
> destination pages as possible, maybe by swapping as many as possible
> back to the source.
>
> That's *extremely* specific though and I now think the best way to get
> there is by landing this feature and then extending it as necessary down
> the road. An allocator may actually want to manage other kinds of
> mappings itself and it would want the mmap_sem optimization to be an
> optional hint.
Interesting. But what might be other users of MREMAP_NOHOLE/MREMAP_RETAIN ?
I believe it can be argued that "exchange vmas/pages" as separate
syscall is actually more general and thus possibly more useful thing
to have. Regardless of locking. And MREMAP_NOHOLE/MREMAP_RETAIN
functionality can be built on top of that syscall in userspace if
needed (with more than one syscall naturally, but maybe still with
relatively small overhead).
I'm not saying this is a good idea, but just asking.
And here is another observation just to make sure that more options
are considered.
Given that mremap is holding mmap_sem exclusively, how about userspace
malloc implementation taking some exclusive malloc lock and doing
normal mremap followed by mmap with MAP_FIXED to fill the hole ? It
might end up having largely same overhead. Well, modulo some extra TLB
flushing. But arguably, reducing TLB flushes for sequence of page
table updates could be usefully addressed separately (e.g. maybe by
matching those syscalls, maybe via syslets).
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-24 4:36 ` Aliaksey Kandratsenka
0 siblings, 0 replies; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-24 4:36 UTC (permalink / raw)
To: Daniel Micay
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
Hi.
First of all, I'd like to apologize for messing up formatting of my
past email. I've learned my lesson.
On Sun, Mar 22, 2015 at 12:22 AM, Daniel Micay <danielmicay@gmail.com> wrote:
>> My initial thinking was that we'd likely use mremap in all cases where
>> we know
>> that touching destination would cause minor page faults (i.e. when
>> destination
>> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
>> always when
>> size is large enough, i.e. because "teleporting" large count of pages is
>> likely
>> to be faster than copying them.
>>
>> But now I realize that it is more interesting than that. I.e. because as
>> Daniel
>> pointed out, mremap holds mmap_sem exclusively, while page faults are
>> holding it
>> for read. That could be optimized of course. Either by separate
>> "teleport ptes"
>> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
>> for write
>> and retaking it for read for "moving pages" part of work. Being not really
>> familiar with kernel code I have no idea if that's doable or not. But it
>> looks
>> like it might be quite important.
>
> I think it's doable but it would pessimize the case where the dest VMA
> isn't reusable. It would need to optimistically take the reader lock to
> find out and then drop it. However, userspace knows when this is surely
> going to work and could give it a hint.
>
> I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
> case would be. It would be extremely specific though... they want the
> kernel to move pages from a source VMA to a destination VMA where both
> are anon/private with identical flags so only the reader lock is
> necessary. On top of that, they really want to keep around as many
> destination pages as possible, maybe by swapping as many as possible
> back to the source.
>
> That's *extremely* specific though and I now think the best way to get
> there is by landing this feature and then extending it as necessary down
> the road. An allocator may actually want to manage other kinds of
> mappings itself and it would want the mmap_sem optimization to be an
> optional hint.
Interesting. But what might be other users of MREMAP_NOHOLE/MREMAP_RETAIN ?
I believe it can be argued that "exchange vmas/pages" as separate
syscall is actually more general and thus possibly more useful thing
to have. Regardless of locking. And MREMAP_NOHOLE/MREMAP_RETAIN
functionality can be built on top of that syscall in userspace if
needed (with more than one syscall naturally, but maybe still with
relatively small overhead).
I'm not saying this is a good idea, but just asking.
And here is another observation just to make sure that more options
are considered.
Given that mremap is holding mmap_sem exclusively, how about userspace
malloc implementation taking some exclusive malloc lock and doing
normal mremap followed by mmap with MAP_FIXED to fill the hole ? It
might end up having largely same overhead. Well, modulo some extra TLB
flushing. But arguably, reducing TLB flushes for sequence of page
table updates could be usefully addressed separately (e.g. maybe by
matching those syscalls, maybe via syslets).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-24 4:36 ` Aliaksey Kandratsenka
(?)
@ 2015-03-24 14:54 ` Daniel Micay
-1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-24 14:54 UTC (permalink / raw)
To: Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 1840 bytes --]
> Given that mremap is holding mmap_sem exclusively, how about userspace
> malloc implementation taking some exclusive malloc lock and doing
> normal mremap followed by mmap with MAP_FIXED to fill the hole ? It
> might end up having largely same overhead. Well, modulo some extra TLB
> flushing. But arguably, reducing TLB flushes for sequence of page
> table updates could be usefully addressed separately (e.g. maybe by
> matching those syscalls, maybe via syslets).
You can't use MAP_FIXED because it has a race with other users of mmap.
The address hint will *usually* work, but you need to deal with the case
where it fails and then cope with the fallout of the fragmentation.
PaX ASLR ignores address hints so that's something else to consider if
you care about running on PaX/Grsecurity patched kernels.
I'm doing this in my own allocator that's heavily based on the jemalloc
design. It just unmaps the memory given by the hinted mmap call if it
fails to get back the hole:
https://github.com/thestinger/allocator/blob/e80d2d0c2863c490b650ecffeb33beaccfcfdc46/huge.c#L167-L180
On 64-bit, it relies on 1TiB of reserved address space (works even with
overcommit disabled) to do per-CPU allocation for chunks and huge (>=
chunk size) allocations via address range checks so it also needs this
ugly workaround too:
https://github.com/thestinger/allocator/blob/e80d2d0c2863c490b650ecffeb33beaccfcfdc46/huge.c#L67-L75
I'm convinced that the mmap_sem writer lock can be avoided for the case
with MREMAP_FIXED via a good heuristic though. It just needs to check
that dst is a single VMA that matches the src properties and fall back
to the writer lock if that's not the case. This will have the same
performance as a separate syscall to move pages in all the cases where
that syscall would work.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-22 7:22 ` Daniel Micay
@ 2015-03-25 16:22 ` Vlastimil Babka
-1 siblings, 0 replies; 44+ messages in thread
From: Vlastimil Babka @ 2015-03-25 16:22 UTC (permalink / raw)
To: Daniel Micay, Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
google-perftools-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org
On 03/22/2015 08:22 AM, Daniel Micay wrote:
> BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
> madvise purging. The part where khugepaged assigns huge pages to dense
> spans of pages is *great*. The part where the kernel hands out a huge
> page on for a fault in a 2M span can be awful. It causes the model
> inside the allocator of uncommitted vs. committed pages to break down.
>
> For example, the allocator might use 1M of a huge page and then start
> purging. The purging will split it into 4k pages, so there will be 1M of
> zeroed 4k pages that are considered purged by the allocator. Over time,
> this can cripple purging. Search for "jemalloc huge pages" and you'll
> find lots of horror stories about this.
I'm not sure I get your description right. The problem I know about is
where "purging" means madvise(MADV_DONTNEED) and khugepaged later
collapses a new hugepage that will repopulate the purged parts,
increasing the memory usage. One can limit this via
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
setting doesn't affect the page fault THP allocations, which however
happen only in newly accessed hugepage-sized areas and not partially
purged ones, though.
> I think a THP implementation playing that played well with purging would
> need to drop the page fault heuristic and rely on a significantly better
> khugepaged.
See here http://lwn.net/Articles/636162/ (the "Compaction" part)
The objection is that some short-lived workloads like gcc have to map
hugepages immediately if they are to benefit from them. I still plan to
improve khugepaged and allow admins to say that they don't want THP page
faults (and rely solely on khugepaged which has more information to
judge additional memory usage), but I'm not sure if it would be an
acceptable default behavior.
One workaround in the current state for jemalloc and friends could be to
use madvise(MADV_NOHUGEPAGE) on hugepage-sized/aligned areas where it
wants to purge parts of them via madvise(MADV_DONTNEED). It could mean
overhead of another syscall and tracking of where this was applied and
when it makes sense to undo this and allow THP to be collapsed again,
though, and it would also split vma's.
> This would mean faulting in a span of memory would no longer
> be faster. Having a flag to populate a range with madvise would help a
If it's a newly mapped memory, there's mmap(MAP_POPULATE). There is also
a madvise(MADV_WILLNEED), which sounds like what you want, but I don't
know what the implementation does exactly - it was apparently added for
paging in ahead, and maybe it ignores unpopulated anonymous areas, but
it would probably be well in spirit of the flag to make it prepopulate
those.
> lot though, since the allocator knows exactly how much it's going to
> clobber with the memcpy. There will still be a threshold where mremap
> gets significantly faster, but it would move it higher.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-25 16:22 ` Vlastimil Babka
0 siblings, 0 replies; 44+ messages in thread
From: Vlastimil Babka @ 2015-03-25 16:22 UTC (permalink / raw)
To: Daniel Micay, Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
On 03/22/2015 08:22 AM, Daniel Micay wrote:
> BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
> madvise purging. The part where khugepaged assigns huge pages to dense
> spans of pages is *great*. The part where the kernel hands out a huge
> page on for a fault in a 2M span can be awful. It causes the model
> inside the allocator of uncommitted vs. committed pages to break down.
>
> For example, the allocator might use 1M of a huge page and then start
> purging. The purging will split it into 4k pages, so there will be 1M of
> zeroed 4k pages that are considered purged by the allocator. Over time,
> this can cripple purging. Search for "jemalloc huge pages" and you'll
> find lots of horror stories about this.
I'm not sure I get your description right. The problem I know about is
where "purging" means madvise(MADV_DONTNEED) and khugepaged later
collapses a new hugepage that will repopulate the purged parts,
increasing the memory usage. One can limit this via
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
setting doesn't affect the page fault THP allocations, which however
happen only in newly accessed hugepage-sized areas and not partially
purged ones, though.
> I think a THP implementation playing that played well with purging would
> need to drop the page fault heuristic and rely on a significantly better
> khugepaged.
See here http://lwn.net/Articles/636162/ (the "Compaction" part)
The objection is that some short-lived workloads like gcc have to map
hugepages immediately if they are to benefit from them. I still plan to
improve khugepaged and allow admins to say that they don't want THP page
faults (and rely solely on khugepaged which has more information to
judge additional memory usage), but I'm not sure if it would be an
acceptable default behavior.
One workaround in the current state for jemalloc and friends could be to
use madvise(MADV_NOHUGEPAGE) on hugepage-sized/aligned areas where it
wants to purge parts of them via madvise(MADV_DONTNEED). It could mean
overhead of another syscall and tracking of where this was applied and
when it makes sense to undo this and allow THP to be collapsed again,
though, and it would also split vma's.
> This would mean faulting in a span of memory would no longer
> be faster. Having a flag to populate a range with madvise would help a
If it's a newly mapped memory, there's mmap(MAP_POPULATE). There is also
a madvise(MADV_WILLNEED), which sounds like what you want, but I don't
know what the implementation does exactly - it was apparently added for
paging in ahead, and maybe it ignores unpopulated anonymous areas, but
it would probably be well in spirit of the flag to make it prepopulate
those.
> lot though, since the allocator knows exactly how much it's going to
> clobber with the memcpy. There will still be a threshold where mremap
> gets significantly faster, but it would move it higher.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-25 16:22 ` Vlastimil Babka
(?)
@ 2015-03-25 20:49 ` Daniel Micay
2015-03-25 20:54 ` Daniel Micay
[not found] ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
-1 siblings, 2 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-25 20:49 UTC (permalink / raw)
To: Vlastimil Babka, Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 3719 bytes --]
On 25/03/15 12:22 PM, Vlastimil Babka wrote:
>
> I'm not sure I get your description right. The problem I know about is
> where "purging" means madvise(MADV_DONTNEED) and khugepaged later
> collapses a new hugepage that will repopulate the purged parts,
> increasing the memory usage. One can limit this via
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
> setting doesn't affect the page fault THP allocations, which however
> happen only in newly accessed hugepage-sized areas and not partially
> purged ones, though.
Since jemalloc doesn't unmap memory but instead does recycling itself in
userspace, it ends up with large spans of free virtual memory and gets
*lots* of huge pages from the page fault heuristic. It keeps track of
active vs. dirty (not purged) vs. clean (purged / untouched) ranges
everywhere, and will purge dirty ranges as they build up.
The THP allocation on page faults means it ends up with memory that's
supposed to be clean but is really not.
A worst case example with the (up until recently) default chunk size of
4M is allocating a bunch of 2.1M allocations. Chunks are naturally
aligned, so each one can be represented as 2 huge pages. It increases
memory usage by nearly *50%*. The allocator thinks the tail is clean
memory, but it's not. When the allocations are freed, it will purge the
2.1M at the head (once enough dirty memory builds up) but all of the
tail memory will be leaked until something else is allocated there and
then freed.
>> I think a THP implementation playing that played well with purging would
>> need to drop the page fault heuristic and rely on a significantly better
>> khugepaged.
>
> See here http://lwn.net/Articles/636162/ (the "Compaction" part)
>
> The objection is that some short-lived workloads like gcc have to map
> hugepages immediately if they are to benefit from them. I still plan to
> improve khugepaged and allow admins to say that they don't want THP page
> faults (and rely solely on khugepaged which has more information to
> judge additional memory usage), but I'm not sure if it would be an
> acceptable default behavior.
> One workaround in the current state for jemalloc and friends could be to
> use madvise(MADV_NOHUGEPAGE) on hugepage-sized/aligned areas where it
> wants to purge parts of them via madvise(MADV_DONTNEED). It could mean
> overhead of another syscall and tracking of where this was applied and
> when it makes sense to undo this and allow THP to be collapsed again,
> though, and it would also split vma's.
Huge pages do significantly help performance though, and this would
pretty much mean no huge pages. The overhead of toggling it on and off
based on whether it's a < chunk size allocation or a >= chunk size one
is too high.
The page fault heuristic is just way too aggressive because there's no
indication of how much memory will be used. I don't think it makes sense
to do it without an explicit MADV_NOHUGEPAGE. Collapsing only dense
ranges doesn't have the same risk.
>> This would mean faulting in a span of memory would no longer
>> be faster. Having a flag to populate a range with madvise would help a
>
> If it's a newly mapped memory, there's mmap(MAP_POPULATE). There is also
> a madvise(MADV_WILLNEED), which sounds like what you want, but I don't
> know what the implementation does exactly - it was apparently added for
> paging in ahead, and maybe it ignores unpopulated anonymous areas, but
> it would probably be well in spirit of the flag to make it prepopulate
> those.
It doesn't seem to do anything for anon mappings atm but I do see a
patch from 2008 for that. I guess it never landed.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-25 20:49 ` Daniel Micay
@ 2015-03-25 20:54 ` Daniel Micay
[not found] ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
1 sibling, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-25 20:54 UTC (permalink / raw)
To: Vlastimil Babka, Aliaksey Kandratsenka
Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 304 bytes --]
> The page fault heuristic is just way too aggressive because there's no
> indication of how much memory will be used. I don't think it makes sense
> to do it without an explicit MADV_NOHUGEPAGE. Collapsing only dense
> ranges doesn't have the same risk.
Er, without an explicit MADV_HUGEPAGE*.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>]
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-25 20:49 ` Daniel Micay
@ 2015-03-26 0:19 ` David Rientjes
[not found] ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
1 sibling, 0 replies; 44+ messages in thread
From: David Rientjes @ 2015-03-26 0:19 UTC (permalink / raw)
To: Daniel Micay
Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton, Shaohua Li,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
Rik van Riel, Hugh Dickins, Mel Gorman, Johannes Weiner,
Michal Hocko, Andy Lutomirski,
google-perftools-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org
On Wed, 25 Mar 2015, Daniel Micay wrote:
> > I'm not sure I get your description right. The problem I know about is
> > where "purging" means madvise(MADV_DONTNEED) and khugepaged later
> > collapses a new hugepage that will repopulate the purged parts,
> > increasing the memory usage. One can limit this via
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
> > setting doesn't affect the page fault THP allocations, which however
> > happen only in newly accessed hugepage-sized areas and not partially
> > purged ones, though.
>
> Since jemalloc doesn't unmap memory but instead does recycling itself in
> userspace, it ends up with large spans of free virtual memory and gets
> *lots* of huge pages from the page fault heuristic. It keeps track of
> active vs. dirty (not purged) vs. clean (purged / untouched) ranges
> everywhere, and will purge dirty ranges as they build up.
>
> The THP allocation on page faults mean it ends up with memory that's
> supposed to be clean but is really not.
>
> A worst case example with the (up until recently) default chunk size of
> 4M is allocating a bunch of 2.1M allocations. Chunks are naturally
> aligned, so each one can be represented as 2 huge pages. It increases
> memory usage by nearly *50%*. The allocator thinks the tail is clean
> memory, but it's not. When the allocations are freed, it will purge the
> 2.1M at the head (once enough dirty memory builds up) but all of the
> tail memory will be leaked until something else is allocated there and
> then freed.
>
With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges
for size classes <= 2MB, allocate its own metadata from an arena that is
also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the
longest span on the freelist when it does periodic memory freeing back to
the kernel, and even better if the freed memory splits at most one
hugepage. When memory is pulled from the freelist of memory that has
already been returned to the kernel, you can return a span that will make
it eligible to be collapsed into a hugepage based on your setting of
max_ptes_none, trying to consolidate the memory as much as possible. If
your malloc is implemented in a way to understand the benefit of
hugepages, and how much memory you're willing to sacrifice (max_ptes_none)
for it, then you should _never_ be increasing memory usage by 50%.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-26 0:19 ` David Rientjes
0 siblings, 0 replies; 44+ messages in thread
From: David Rientjes @ 2015-03-26 0:19 UTC (permalink / raw)
To: Daniel Micay
Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton, Shaohua Li,
linux-mm, linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
Johannes Weiner, Michal Hocko, Andy Lutomirski,
google-perftools@googlegroups.com
On Wed, 25 Mar 2015, Daniel Micay wrote:
> > I'm not sure I get your description right. The problem I know about is
> > where "purging" means madvise(MADV_DONTNEED) and khugepaged later
> > collapses a new hugepage that will repopulate the purged parts,
> > increasing the memory usage. One can limit this via
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
> > setting doesn't affect the page fault THP allocations, which however
> > happen only in newly accessed hugepage-sized areas and not partially
> > purged ones, though.
>
> Since jemalloc doesn't unmap memory but instead does recycling itself in
> userspace, it ends up with large spans of free virtual memory and gets
> *lots* of huge pages from the page fault heuristic. It keeps track of
> active vs. dirty (not purged) vs. clean (purged / untouched) ranges
> everywhere, and will purge dirty ranges as they build up.
>
> The THP allocation on page faults mean it ends up with memory that's
> supposed to be clean but is really not.
>
> A worst case example with the (up until recently) default chunk size of
> 4M is allocating a bunch of 2.1M allocations. Chunks are naturally
> aligned, so each one can be represented as 2 huge pages. It increases
> memory usage by nearly *50%*. The allocator thinks the tail is clean
> memory, but it's not. When the allocations are freed, it will purge the
> 2.1M at the head (once enough dirty memory builds up) but all of the
> tail memory will be leaked until something else is allocated there and
> then freed.
>
With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges
for size classes <= 2MB, allocate its own metadata from an arena that is
also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the
longest span on the freelist when it does periodic memory freeing back to
the kernel, and even better if the freed memory splits at most one
hugepage. When memory is pulled from the freelist of memory that has
already been returned to the kernel, you can return a span that will make
it eligible to be collapsed into a hugepage based on your setting of
max_ptes_none, trying to consolidate the memory as much as possible. If
your malloc is implemented in a way to understand the benefit of
hugepages, and how much memory you're willing to sacrifice (max_ptes_none)
for it, then you should _never_ be increasing memory usage by 50%.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-26 0:19 ` David Rientjes
(?)
@ 2015-03-26 0:24 ` Daniel Micay
[not found] ` <551351CA.3090803-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
-1 siblings, 1 reply; 44+ messages in thread
From: Daniel Micay @ 2015-03-26 0:24 UTC (permalink / raw)
To: David Rientjes
Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton, Shaohua Li,
linux-mm, linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
Johannes Weiner, Michal Hocko, Andy Lutomirski,
google-perftools@googlegroups.com
[-- Attachment #1: Type: text/plain, Size: 2727 bytes --]
On 25/03/15 08:19 PM, David Rientjes wrote:
> On Wed, 25 Mar 2015, Daniel Micay wrote:
>
>>> I'm not sure I get your description right. The problem I know about is
>>> where "purging" means madvise(MADV_DONTNEED) and khugepaged later
>>> collapses a new hugepage that will repopulate the purged parts,
>>> increasing the memory usage. One can limit this via
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
>>> setting doesn't affect the page fault THP allocations, which however
>>> happen only in newly accessed hugepage-sized areas and not partially
>>> purged ones, though.
>>
>> Since jemalloc doesn't unmap memory but instead does recycling itself in
>> userspace, it ends up with large spans of free virtual memory and gets
>> *lots* of huge pages from the page fault heuristic. It keeps track of
>> active vs. dirty (not purged) vs. clean (purged / untouched) ranges
>> everywhere, and will purge dirty ranges as they build up.
>>
>> The THP allocation on page faults mean it ends up with memory that's
>> supposed to be clean but is really not.
>>
>> A worst case example with the (up until recently) default chunk size of
>> 4M is allocating a bunch of 2.1M allocations. Chunks are naturally
>> aligned, so each one can be represented as 2 huge pages. It increases
>> memory usage by nearly *50%*. The allocator thinks the tail is clean
>> memory, but it's not. When the allocations are freed, it will purge the
>> 2.1M at the head (once enough dirty memory builds up) but all of the
>> tail memory will be leaked until something else is allocated there and
>> then freed.
>>
>
> With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges
> for size classes <= 2MB, allocate its own metadata from an arena that is
> also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the
> longest span on the freelist when it does periodic memory freeing back to
> the kernel, and even better if the freed memory splits at most one
> hugepage. When memory is pulled from the freelist of memory that has
> already been returned to the kernel, you can return a span that will make
> it eligible to be collapsed into a hugepage based on your setting of
> max_ptes_none, trying to consolidate the memory as much as possible. If
> your malloc is implemented in a way to understand the benefit of
> hugepages, and how much memory you're willing to sacrifice (max_ptes_none)
> for it, then you should _never_ be increasing memory usage by 50%.
If khugepaged was the only source of huge pages, sure. The primary
source of huge pages is the heuristic handing out an entire 2M page on
the first page fault in a 2M range.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-22 6:06 ` Aliaksey Kandratsenka
[not found] ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-03-23 5:17 ` Shaohua Li
2015-03-24 5:25 ` Aliaksey Kandratsenka
1 sibling, 1 reply; 44+ messages in thread
From: Shaohua Li @ 2015-03-23 5:17 UTC (permalink / raw)
To: Aliaksey Kandratsenka
Cc: Daniel Micay, Andrew Morton, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
On Sat, Mar 21, 2015 at 11:06:14PM -0700, Aliaksey Kandratsenka wrote:
> On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com>
> wrote:
> >
> > On 18/03/15 06:31 PM, Andrew Morton wrote:
> > > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> > >
> > >> There was a similar patch posted before, but it doesn't get merged.
> I'd like
> > >> to try again if there are more discussions.
> > >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> > >>
> > >> mremap can be used to accelerate realloc. The problem is mremap will
> > >> punch a hole in original VMA, which makes specific memory allocator
> > >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > >> chunks. mremap a range of the chunk will punch a hole, which other
> > >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > >> can't handle it.
> > >
> > > Daniel's changelog had additional details regarding the userspace
> > > allocators' behaviour. It would be best to incorporate that into your
> > > changelog.
> > >
> > > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > > Can you please do this?
> > >
> > > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > > confirmation that this patch will be useful for tcmalloc. Has anyone
> > > tried it, or sought input from tcmalloc developers?
> >
> > TCMalloc and jemalloc are currently equally slow in this benchmark, as
> > neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> > the currently most active TCMalloc developer so they can give input
> > into whether this patch would let them use it.
>
>
> Hi.
>
> Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).
>
> Yes, that might be useful feature. (Assuming I understood it correctly) I
> believe
> tcmalloc would likely use:
>
> mremap(old_ptr, move_size, move_size,
> MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
> new_ptr);
>
> as optimized equivalent of:
>
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);
>
> And btw I find MREMAP_RETAIN name from original patch to be slightly more
> intuitive than MREMAP_NOHOLE. In my humble opinion the later name does not
> reflect semantic of this feature at all (assuming of course I correctly
> understood what the patch does).
>
> I do have a couple of questions about this approach however. Please feel
> free to
> educate me on them.
>
> a) what is the smallest size where mremap is going to be faster ?
>
> My initial thinking was that we'd likely use mremap in all cases where we
> know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also always
> when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
>
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate "teleport
> ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
> write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.
Is mmap_sem contended in your workload? Otherwise, there is no big
difference between a read and a write lock. memcpy to a new allocation
could trigger page faults, new page allocation overhead, etc.
> Another aspect where I am similarly illiterate is performance effect of tlb
> flushes needed for such operation.
MADV_DONTNEED does tlb flush too.
> We can certainly experiment and find that limit. But if mremap threshold is
> going to be large, then perhaps this kernel feature is not as useful as we
> may
> hope.
There are a lot of factors here:
For mremap, the overhead:
-mmap sem write lock
-tlb flush
For memcpy + madvise, the overhead:
-memcpy
-new address triggers page fault (allocate new pages, handle page fault)
-is old address MADV_DONTNEED? (tlb flush)
I thought mremap is a win unless the allocator only uses memcpy (without
madvise, in which case the allocator will use more memory than necessary)
for small sizes (where memcpy is faster than a tlb flush). We can
probably measure the size below which memcpy has smaller overhead than
a tlb flush.
> b) is that optimization worth having at all ?
>
> After all, memcpy is actually known to be fast. I understand that copying
> memory
> in user space can be slowed down by minor page faults (results below seem to
> confirm that). But this is something where either allocator may retain
> populated
> pages a bit longer or where kernel could help. E.g. maybe by exposing
> something
> similar to MAP_POPULATE in madvise, or even doing some safe combination of
> madvise and MAP_UNINITIALIZED.
This option will make allocator use more memory than expected.
Eventually the memory must be reclaimed, which has big overhead too.
> I've played with Daniel's original benchmark (copied from
> http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
> modifications:
>
> #include <string.h>
> #include <stdlib.h>
> #include <stdio.h>
> #include <sys/mman.h>
>
> int main(int argc, char **argv)
> {
> if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
> int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
> if (rv) {
> perror("mlockall");
> abort();
> }
> puts("mlocked!");
> }
>
> for (size_t i = 0; i < 64; i++) {
> void *ptr = NULL;
> size_t old_size = 0;
> for (size_t size = 4; size < (1 << 30); size *= 2) {
> /*
> * void *hole = malloc(1 << 20);
> * if (!hole) {
> * perror("malloc");
> * abort();
> * }
> */
> ptr = realloc(ptr, size);
> if (!ptr) {
> perror("realloc");
> abort();
> }
> /* free(hole); */
> memset(ptr + old_size, 0xff, size - old_size);
> old_size = size;
> }
> free(ptr);
> }
> }
>
> I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
> important applications or not. It can be argued that apps that care about
> such
> large vectors can do mremap themselves.
>
> On the other hand, I believe that this micro benchmark could be plausibly
> changed to grow vector by smaller factor (i.e. see
> https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
> And
> with smaller growth factor, is seems reasonable to expect larger overhead
> from
> memcpy and smaller overhead from mremap. And thus favor mremap more.
>
> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple weeks
> or so
> ago from dev branch, so it might simply have bugs.
>
> Results also vary greatly depending in transparent huge pages setting.
> Here's
> what I've got:
>
> allocator | mode | time | sys time | pgfaults | extra
> ----------+-----------+-------+----------+----------+-------------------------------
> glibc | | 10.75 | 8.44 | 8388770 |
> glibc | thp | 5.67 | 3.44 | 310882 |
> glibc | mlock | 13.22 | 9.41 | 8388821 |
> glibc | thp+mlock | 8.43 | 4.63 | 310933 |
> tcmalloc | | 11.46 | 2.00 | 2104826 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc | thp | 10.61 | 0.89 | 386206 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc | mlock | 10.11 | 0.27 | 264721 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc | thp+mlock | 10.28 | 0.17 | 46011 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc | | 23.63 | 17.16 | 16770107 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc | thp | 11.82 | 5.14 | 352477 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc | mlock | 10.10 | 0.28 | 264724 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc | thp+mlock | 10.30 | 0.17 | 49168 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> jemalloc1 | | 23.71 | 17.33 | 16744572 |
> jemalloc1 | thp | 11.65 | 4.68 | 64988 |
> jemalloc1 | mlock | 10.13 | 0.29 | 263305 |
> jemalloc1 | thp+mlock | 10.05 | 0.17 | 50217 |
> jemalloc2 | | 10.87 | 8.64 | 8521796 |
> jemalloc2 | thp | 4.64 | 2.32 | 56060 |
> jemalloc2 | mlock | 4.22 | 0.28 | 263181 |
> jemalloc2 | thp+mlock | 4.12 | 0.19 | 50411 |
> ----------+-----------+-------+----------+----------+-------------------------------
>
> NOTE: usual disclaimer applies about possibility of screwing something up
> and
> getting invalid benchmark results without being able to see it. I apologize
> in
> advance.
>
> NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is
> home-built snapshot of upcoming jemalloc 4.0.
>
> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts and
> thus
> on runtime.
>
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
> is
> doing in this benchmark.
mlock disables MADV_DONTNEED, so this is an unfair comparison. With it,
the allocator will use more memory than expected.
I'm kind of confused why we talk about THP and mlock here. When an application
uses an allocator, it doesn't need to be forced to use THP or mlock. Can we
focus on the normal case?
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
2015-03-23 5:17 ` Shaohua Li
@ 2015-03-24 5:25 ` Aliaksey Kandratsenka
[not found] ` <CADpJO7zk8J3q7Bw9NibV9CzLarO+YkfeshyFTTq=XeS5qziBiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-24 5:25 UTC (permalink / raw)
To: Shaohua Li
Cc: Daniel Micay, Andrew Morton, linux-mm, linux-api, Rik van Riel,
Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
Andy Lutomirski, google-perftools@googlegroups.com
Hi.
On Sun, Mar 22, 2015 at 10:17 PM, Shaohua Li <shli@fb.com> wrote:
> On Sat, Mar 21, 2015 at 11:06:14PM -0700, Aliaksey Kandratsenka wrote:
>> But now I realize that it is more interesting than that. I.e. because as
>> Daniel
>> pointed out, mremap holds mmap_sem exclusively, while page faults are
>> holding it
>> for read. That could be optimized of course. Either by separate "teleport
>> ptes"
>> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
>> write
>> and retaking it for read for "moving pages" part of work. Being not really
>> familiar with kernel code I have no idea if that's doable or not. But it
>> looks
>> like it might be quite important.
>
> Does mmap_sem contend in your workload? Otherwise, there is no big
> difference of read or write lock. memcpy to new allocation could trigger
> page fault, new page allocation overhead and etc.
Well, I don't have any workloads. I'm just maintaining a library that
others run various workloads on. Part of the problem is the lack of good
and varied malloc benchmarks which could allow us to prevent
regressions. So this makes me a bit more cautious on performance
matters.
But I see your point. Indeed I have no evidence at all that exclusive
locking might cause observable performance difference.
>> b) is that optimization worth having at all ?
>>
>> After all, memcpy is actually known to be fast. I understand that copying
>> memory
>> in user space can be slowed down by minor page faults (results below seem to
>> confirm that). But this is something where either allocator may retain
>> populated
>> pages a bit longer or where kernel could help. E.g. maybe by exposing
>> something
>> similar to MAP_POPULATE in madvise, or even doing some safe combination of
>> madvise and MAP_UNINITIALIZED.
>
> This option will make allocator use more memory than expected.
> Eventually the memory must be reclaimed, which has big overhead too.
>
>> I've played with Daniel's original benchmark (copied from
>> http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
>> modifications:
>>
...
>> Another notable thing is how mlock effectively disables MADV_DONTNEED for
>> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
>> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
>> slightly better on runtime to glibc. The later spends a ton of time in
>> kernel,
>> probably handling minor page faults, and the former burns cpu in user space
>> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
>> is
>> doing in this benchmark.
>
> mlock disables MADV_DONTNEED, so this is an unfair comparsion. With it,
> allocator will use more memory than expected.
I do not agree that it is unfair. I'm actually hoping MADV_FREE will provide
most if not all of the benefits of mlock in this benchmark. I believe that's
not too unreasonable an expectation.
>
> I'm kind of confused why we talk about THP, mlock here. When application
> uses allocator, it doesn't need to be forced to use THP or mlock. Can we
> forcus on normal case?
See my note on mlock above.
THP is actually "normal". I know for certain that many production
workloads are run on boxes with THP enabled. Red Hat famously ships
its distros with THP set to "always". And I also know that many other
production workloads are run on boxes with THP disabled. Also, as
seen above, "teleporting" pages is more efficient with THP due to the much
smaller overhead of moving those pages. So I felt it was important not
to omit THP in my runs.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 44+ messages in thread
end of thread, other threads:[~2015-03-26 20:45 UTC | newest]
Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-03-17 21:09 [PATCH] mremap: add MREMAP_NOHOLE flag --resend Shaohua Li
2015-03-17 21:09 ` Shaohua Li
[not found] ` <deaa4139de6e6422a0cec1e3282553aed3495e94.1426626497.git.shli-b10kYP2dOMg@public.gmane.org>
2015-03-18 22:31 ` Andrew Morton
2015-03-18 22:31 ` Andrew Morton
[not found] ` <20150318153100.5658b741277f3717b52e42d9-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2015-03-19 5:08 ` Shaohua Li
2015-03-19 5:08 ` Shaohua Li
[not found] ` <20150319050826.GA1591708-XA4dbxeItU7BTsLV8vAZyg2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2015-03-19 5:22 ` Andrew Morton
2015-03-19 5:22 ` Andrew Morton
[not found] ` <20150318222246.bc608dd0.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2015-03-19 16:38 ` Shaohua Li
2015-03-19 16:38 ` Shaohua Li
2015-03-19 5:34 ` Daniel Micay
2015-03-19 5:34 ` Daniel Micay
2015-03-22 6:06 ` Aliaksey Kandratsenka
[not found] ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-03-22 7:22 ` Daniel Micay
2015-03-22 7:22 ` Daniel Micay
[not found] ` <550E6D9D.1060507-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-24 4:36 ` Aliaksey Kandratsenka
2015-03-24 4:36 ` Aliaksey Kandratsenka
2015-03-24 14:54 ` Daniel Micay
2015-03-25 16:22 ` Vlastimil Babka
2015-03-25 16:22 ` Vlastimil Babka
2015-03-25 20:49 ` Daniel Micay
2015-03-25 20:54 ` Daniel Micay
[not found] ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26 0:19 ` David Rientjes
2015-03-26 0:19 ` David Rientjes
2015-03-26 0:24 ` Daniel Micay
[not found] ` <551351CA.3090803-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26 2:31 ` David Rientjes
2015-03-26 2:31 ` David Rientjes
[not found] ` <alpine.DEB.2.10.1503251914260.16714-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2015-03-26 3:24 ` Daniel Micay
2015-03-26 3:24 ` Daniel Micay
[not found] ` <55137C06.9020608-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26 3:36 ` Daniel Micay
2015-03-26 3:36 ` Daniel Micay
2015-03-26 17:25 ` Vlastimil Babka
2015-03-26 17:25 ` Vlastimil Babka
2015-03-26 20:45 ` Daniel Micay
2015-03-23 5:17 ` Shaohua Li
2015-03-24 5:25 ` Aliaksey Kandratsenka
[not found] ` <CADpJO7zk8J3q7Bw9NibV9CzLarO+YkfeshyFTTq=XeS5qziBiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-03-24 14:39 ` Daniel Micay
2015-03-24 14:39 ` Daniel Micay
2015-03-25 5:02 ` Shaohua Li
2015-03-26 0:50 ` Minchan Kim
2015-03-26 1:21 ` Daniel Micay
2015-03-26 1:21 ` Daniel Micay
[not found] ` <55135F06.4000906-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26 7:02 ` Minchan Kim
2015-03-26 7:02 ` Minchan Kim
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.