* [PATCH v3] mm: add mremap flag for preserving the old mapping
@ 2014-09-30 4:55 Daniel Micay
[not found] ` <1412052900-1722-1-git-send-email-danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
0 siblings, 1 reply; 6+ messages in thread
From: Daniel Micay @ 2014-09-30 4:55 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, linux-api, akpm, jasone, Daniel Micay
This introduces the MREMAP_RETAIN flag for preserving the source mapping
when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
the source location will fault and cause fresh pages to be mapped in.
For consistency, the old_len >= new_len case could decommit the pages
instead of unmapping. However, userspace can accomplish the same thing
via madvise and a coherent definition of the flag is possible without
the extra complexity.
Motivation:
TCMalloc and jemalloc avoid releasing virtual memory in order to reduce
virtual memory fragmentation. A call to munmap or mremap would leave a
hole in the address space. Instead, unused pages are lazily returned to
the operating system via MADV_DONTNEED.
Since mremap cannot be used to elide copies, TCMalloc and jemalloc end
up being significantly slower for patterns like repeated vector / hash
table reallocations. Consider the typical vector building pattern:
#include <string.h>
#include <stdlib.h>
/*
 * Vector-building benchmark: grow one buffer by repeated doubling with
 * realloc (4 bytes up to 2^29 bytes) and fill only the newly added tail
 * with 0xff after each step.
 *
 * Returns 0 on success, 1 if realloc fails.
 */
int main(void) {
	void *ptr = NULL;
	size_t old_size = 0;
	for (size_t size = 4; size < (1 << 30); size *= 2) {
		ptr = realloc(ptr, size);
		if (!ptr) return 1;
		/*
		 * Arithmetic on void * is a GNU extension, not ISO C;
		 * step through the buffer as char * instead.
		 */
		memset((char *)ptr + old_size, 0xff, size - old_size);
		old_size = size;
	}
	free(ptr);
	return 0;
}
glibc (baseline, uses mremap already): 0.135s
jemalloc without MREMAP_RETAIN: 0.226s
jemalloc with MREMAP_RETAIN: 0.112s
TCMalloc without MREMAP_RETAIN: 0.238s
(the improvement should be similar to jemalloc)
In practice, in-place growth never occurs because the heap grows in the
downwards direction for all 3 allocators. TCMalloc and jemalloc pay for
enormous copies while glibc is only spending time writing new elements
to the vector. Even if it was grown in the other direction, real-world
applications would end up blocking in-place growth with new allocations.
The allocators could attempt to map the source location again after an
mremap call, but there is no guarantee of success in a multi-threaded
program and fragmenting memory over time is considered unacceptable.
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
---
include/uapi/linux/mman.h | 1 +
mm/mremap.c | 39 ++++++++++++++++++++++++---------------
2 files changed, 25 insertions(+), 15 deletions(-)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..4e9a546 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
+#define MREMAP_RETAIN 4
#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180..079334a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -235,7 +235,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr, bool retain,
+ bool *locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -287,15 +288,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- }
-
- /* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT) {
- vma->vm_flags &= ~VM_ACCOUNT;
- excess = vma->vm_end - vma->vm_start - old_len;
- if (old_addr > vma->vm_start &&
- old_addr + old_len < vma->vm_end)
- split = 1;
+ retain = false;
}
/*
@@ -310,6 +303,19 @@ static unsigned long move_vma(struct vm_area_struct *vma,
hiwater_vm = mm->hiwater_vm;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+ /* Leave the old mapping in place for MREMAP_RETAIN. */
+ if (retain)
+ goto out;
+
+ /* Conceal VM_ACCOUNT so old reservation is not undone */
+ if (vm_flags & VM_ACCOUNT) {
+ vma->vm_flags &= ~VM_ACCOUNT;
+ excess = vma->vm_end - vma->vm_start - old_len;
+ if (old_addr > vma->vm_start &&
+ old_addr + old_len < vma->vm_end)
+ split = 1;
+ }
+
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -324,6 +330,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}
+out:
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
@@ -392,7 +399,8 @@ Eagain:
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+ unsigned long new_addr, unsigned long new_len, bool retain,
+ bool *locked)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -442,7 +450,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (ret & ~PAGE_MASK)
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, retain, locked);
if (!(ret & ~PAGE_MASK))
goto out;
out1:
@@ -482,7 +490,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long charged = 0;
bool locked = false;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_RETAIN))
return ret;
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +514,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ flags & MREMAP_RETAIN, &locked);
goto out;
}
@@ -575,7 +583,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr,
+ flags & MREMAP_RETAIN, &locked);
}
out:
if (ret & ~PAGE_MASK)
--
2.1.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v3] mm: add mremap flag for preserving the old mapping
[not found] ` <1412052900-1722-1-git-send-email-danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2014-09-30 5:53 ` Andy Lutomirski
2014-09-30 9:36 ` Daniel Micay
0 siblings, 1 reply; 6+ messages in thread
From: Andy Lutomirski @ 2014-09-30 5:53 UTC (permalink / raw)
To: Daniel Micay
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Linux API,
Andrew Morton, jasone-Z38pQHDqNGNl57MIdRCFDg
On Mon, Sep 29, 2014 at 9:55 PM, Daniel Micay <danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> This introduces the MREMAP_RETAIN flag for preserving the source mapping
> when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
> the source location will fault and cause fresh pages to be mapped in.
>
> For consistency, the old_len >= new_len case could decommit the pages
> instead of unmapping. However, userspace can accomplish the same thing
> via madvise and a coherent definition of the flag is possible without
> the extra complexity.
IMO this needs very clear documentation of exactly what it does.
Does it preserve the contents of the source pages? (If so, why?
Aren't you wasting a bunch of time on page faults and possibly
unnecessary COWs?)
Does it work on file mappings? Can it extend file mappings while it moves them?
If you MREMAP_RETAIN a partially COWed private mapping, what happens?
Does it work on special mappings? If so, please prevent it from doing
so. mremapping x86's vdso is a thing, and duplicating x86's vdso
should not become a thing, because x86_32 in particular will become
extremely confused.
--Andy
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] mm: add mremap flag for preserving the old mapping
2014-09-30 5:53 ` Andy Lutomirski
@ 2014-09-30 9:36 ` Daniel Micay
2014-09-30 17:49 ` Andy Lutomirski
0 siblings, 1 reply; 6+ messages in thread
From: Daniel Micay @ 2014-09-30 9:36 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Linux API,
Andrew Morton, jasone
On 30/09/14 01:53 AM, Andy Lutomirski wrote:
> On Mon, Sep 29, 2014 at 9:55 PM, Daniel Micay <danielmicay@gmail.com> wrote:
>> This introduces the MREMAP_RETAIN flag for preserving the source mapping
>> when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
>> the source location will fault and cause fresh pages to be mapped in.
>>
>> For consistency, the old_len >= new_len case could decommit the pages
>> instead of unmapping. However, userspace can accomplish the same thing
>> via madvise and a coherent definition of the flag is possible without
>> the extra complexity.
>
> IMO this needs very clear documentation of exactly what it does.
Agreed, and thanks for the review. I'll post a slightly modified version
of the patch soon (mostly more commit message changes).
> Does it preserve the contents of the source pages? (If so, why?
> Aren't you wasting a bunch of time on page faults and possibly
> unnecessary COWs?)
The source will act as if it was just created. For an anonymous memory
mapping, it will fault on any accesses and bring in new zeroed pages.
In jemalloc, it replaces an enormous memcpy(dst, src, size) followed by
madvise(src, size, MADV_DONTNEED) with mremap. Using mremap also ends up
eliding page faults from writes at the destination.
TCMalloc has nearly the same page allocation design, although it tries
to throttle the purging so it won't always gain as much.
> Does it work on file mappings? Can it extend file mappings while it moves them?
It works on file mappings. If a move occurs, there will be the usual
extended destination mapping but with the source mapping left intact.
It wouldn't be useful with existing allocators, but in theory a general
purpose allocator could expose an MMIO API in order to reuse the same
address space via MAP_FIXED/MREMAP_FIXED to reduce VM fragmentation.
> If you MREMAP_RETAIN a partially COWed private mapping, what happens?
The original mapping is zeroed in the following test, as it would be
without fork:
#define _GNU_SOURCE
#include <string.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/wait.h>
/*
 * Exercise MREMAP_RETAIN on a private anonymous mapping after fork.
 *
 * The parent fills a 1 MiB mapping with the byte 5 and forks. The child
 * rewrites the first 1024 bytes, then grows the mapping to 128 MiB with
 * MREMAP_MAYMOVE | 4 (4 is the MREMAP_RETAIN value added by the patch; it is
 * spelled numerically because installed headers do not define it yet).
 * The child then checks that the data is visible at the new address and that
 * the old address range reads back as zeroes.
 *
 * Returns 0 on success and 1 on any failure; the parent forwards the
 * child's exit status.
 */
int main(void) {
	size_t size = 1024 * 1024;
	char *orig = mmap(NULL, size, PROT_READ|PROT_WRITE,
			  MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (orig == MAP_FAILED)
		return 1;
	memset(orig, 5, size);

	pid_t pid = fork();
	if (pid == -1)
		return 1;

	if (pid == 0) {
		memset(orig, 5, 1024);
		char *new = mremap(orig, size, size * 128, MREMAP_MAYMOVE|4);
		/* A failed mremap returns MAP_FAILED, which must not be dereferenced. */
		if (new == MAP_FAILED)
			return 1;
		/* The test is only meaningful if the mapping actually moved. */
		if (new == orig)
			return 1;
		for (size_t i = 0; i < size; i++)
			if (new[i] != 5)
				return 1;
		for (size_t i = 0; i < size; i++)
			if (orig[i] != 0)
				return 1;
		return 0;
	}

	int status;
	/* wait() reports failure as -1; the old "< -1" test could never fire. */
	if (wait(&status) == -1)
		return 1;
	if (WIFEXITED(status))
		return WEXITSTATUS(status);
	return 1;
}
Hopefully this is the case you're referring to. :)
> Does it work on special mappings? If so, please prevent it from doing
> so. mremapping x86's vdso is a thing, and duplicating x86's vdso
> should not become a thing, because x86_32 in particular will become
> extremely confused.
I'll add a check for arch_vma_name(vma) == NULL.
There's an existing check for VM_DONTEXPAND | VM_PFNMAP when expanding
allocations (the only case this flag impacts). Are there other kinds of
special mappings that you're referring to?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] mm: add mremap flag for preserving the old mapping
2014-09-30 9:36 ` Daniel Micay
@ 2014-09-30 17:49 ` Andy Lutomirski
2014-10-01 2:32 ` Daniel Micay
2014-10-02 21:09 ` Daniel Micay
0 siblings, 2 replies; 6+ messages in thread
From: Andy Lutomirski @ 2014-09-30 17:49 UTC (permalink / raw)
To: Daniel Micay
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Andrew Morton,
Jason Evans, Linux API
On Sep 30, 2014 2:36 AM, "Daniel Micay" <danielmicay@gmail.com> wrote:
>
> On 30/09/14 01:53 AM, Andy Lutomirski wrote:
> > On Mon, Sep 29, 2014 at 9:55 PM, Daniel Micay <danielmicay@gmail.com> wrote:
> >> This introduces the MREMAP_RETAIN flag for preserving the source mapping
> >> when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
> >> the source location will fault and cause fresh pages to be mapped in.
> >>
> >> For consistency, the old_len >= new_len case could decommit the pages
> >> instead of unmapping. However, userspace can accomplish the same thing
> >> via madvise and a coherent definition of the flag is possible without
> >> the extra complexity.
> >
> > IMO this needs very clear documentation of exactly what it does.
>
> Agreed, and thanks for the review. I'll post a slightly modified version
> of the patch soon (mostly more commit message changes).
>
> > Does it preserve the contents of the source pages? (If so, why?
> > Aren't you wasting a bunch of time on page faults and possibly
> > unnecessary COWs?)
>
> The source will act as if it was just created. For an anonymous memory
> mapping, it will fault on any accesses and bring in new zeroed pages.
>
> In jemalloc, it replaces an enormous memcpy(dst, src, size) followed by
> madvise(src, size, MADV_DONTNEED) with mremap. Using mremap also ends up
> eliding page faults from writes at the destination.
>
> TCMalloc has nearly the same page allocation design, although it tries
> to throttle the purging so it won't always gain as much.
>
> > Does it work on file mappings? Can it extend file mappings while it moves them?
>
> It works on file mappings. If a move occurs, there will be the usual
> extended destination mapping but with the source mapping left intact.
>
> It wouldn't be useful with existing allocators, but in theory a general
> purpose allocator could expose an MMIO API in order to reuse the same
> address space via MAP_FIXED/MREMAP_FIXED to reduce VM fragmentation.
>
> > If you MREMAP_RETAIN a partially COWed private mapping, what happens?
>
> The original mapping is zeroed in the following test, as it would be
> without fork:
>
> #define _GNU_SOURCE
>
> #include <string.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <unistd.h>
> #include <sys/wait.h>
>
> int main(void) {
> size_t size = 1024 * 1024;
> char *orig = mmap(NULL, size, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> memset(orig, 5, size);
> int pid = fork();
> if (pid == -1)
> return 1;
> if (pid == 0) {
> memset(orig, 5, 1024);
> char *new = mremap(orig, size, size * 128, MREMAP_MAYMOVE|4);
> if (new == orig) return 1;
> for (size_t i = 0; i < size; i++)
> if (new[i] != 5)
> return 1;
> for (size_t i = 0; i < size; i++)
> if (orig[i] != 0)
> return 1;
> return 0;
> }
> int status;
> if (wait(&status) < -1) return 1;
> if (WIFEXITED(status))
> return WEXITSTATUS(status);
> return 1;
> }
>
> Hopefully this is the case you're referring to. :)
What about private file mappings?
>
> > Does it work on special mappings? If so, please prevent it from doing
> > so. mremapping x86's vdso is a thing, and duplicating x86's vdso
> > should not become a thing, because x86_32 in particular will become
> > extremely confused.
>
> I'll add a check for arch_vma_name(vma) == NULL.
Careful! That function is deprecated in favor of vm_ops->name.
I think it might pay to add an explicit vm_op to authorize
duplication, especially for non-cow mappings. IOW this kind of
extension seems quite magical for anything that doesn't have the
normal COW semantics, including for plain old read-only mappings.
>
> There's an existing check for VM_DONTEXPAND | VM_PFNMAP when expanding
> allocations (the only case this flag impacts). Are there other kinds of
> special mappings that you're referring to?
I was referring to special mappings in the install_special_mapping
sense. Those may or may not have VM_PFNMAP set.
If VM_DONTEXPAND blocks this new feature entirely, that's probably good.
--Andy
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] mm: add mremap flag for preserving the old mapping
2014-09-30 17:49 ` Andy Lutomirski
@ 2014-10-01 2:32 ` Daniel Micay
2014-10-02 21:09 ` Daniel Micay
1 sibling, 0 replies; 6+ messages in thread
From: Daniel Micay @ 2014-10-01 2:32 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Andrew Morton,
Jason Evans, Linux API
On 30/09/14 01:49 PM, Andy Lutomirski wrote:
>
> I think it might pay to add an explicit vm_op to authorize
> duplication, especially for non-cow mappings. IOW this kind of
> extension seems quite magical for anything that doesn't have the
> normal COW semantics, including for plain old read-only mappings.
This sounds like the best way forwards.
Setting up the op for private, anonymous mappings and having it check
vm_flags & VM_WRITE dynamically seems like it would be enough for the
intended use case in general purpose allocators. It can be extended to
other mapping types later if there's a compelling use case.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v3] mm: add mremap flag for preserving the old mapping
2014-09-30 17:49 ` Andy Lutomirski
2014-10-01 2:32 ` Daniel Micay
@ 2014-10-02 21:09 ` Daniel Micay
1 sibling, 0 replies; 6+ messages in thread
From: Daniel Micay @ 2014-10-02 21:09 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Andrew Morton,
Jason Evans, Linux API
[-- Attachment #1: Type: text/plain, Size: 707 bytes --]
On 30/09/14 01:49 PM, Andy Lutomirski wrote:
>
> I think it might pay to add an explicit vm_op to authorize
> duplication, especially for non-cow mappings. IOW this kind of
> extension seems quite magical for anything that doesn't have the
> normal COW semantics, including for plain old read-only mappings.
Adding a vm_ops table to MAP_PRIVATE|MAP_ANONYMOUS mappings has a
significant performance impact. I haven't yet narrowed it down, but
there's at least one code path with a check of `!vma->vm_ops` for the fast
path. One is for transparent huge page faults, so the performance impact
makes sense. I'll use a simpler implementation for now since the
requirements are very narrow / simple.
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-10-02 21:09 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-09-30 4:55 [PATCH v3] mm: add mremap flag for preserving the old mapping Daniel Micay
[not found] ` <1412052900-1722-1-git-send-email-danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2014-09-30 5:53 ` Andy Lutomirski
2014-09-30 9:36 ` Daniel Micay
2014-09-30 17:49 ` Andy Lutomirski
2014-10-01 2:32 ` Daniel Micay
2014-10-02 21:09 ` Daniel Micay
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).