From: David Gibson <david@gibson.dropbear.id.au>
To: Adam Litke <agl@us.ibm.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
hugh@veritas.com, rohit.seth@intel.com, "Chen,
Kenneth W" <kenneth.w.chen@intel.com>,
akpm@osdl.org
Subject: Re: [RFC 2/2] Hugetlb COW
Date: Tue, 8 Nov 2005 14:46:13 +1100 [thread overview]
Message-ID: <20051108034613.GD14336@localhost.localdomain> (raw)
In-Reply-To: <1131399533.25133.104.camel@localhost.localdomain>
On Mon, Nov 07, 2005 at 03:38:53PM -0600, Adam Litke wrote:
> [RFC] COW for hugepages
> (Patch originally from David Gibson <dwg@au1.ibm.com>)
Just realised there's at least one seriouss bug in the patch -
find_or_alloc_huge_page() can return a non-locked page when demand
faulting a PRIVATE mapping. That will cause the unlock_page() further
down to BUG(). Here's a corrected version.
This patch implements copy-on-write for hugepages, hence allowing
MAP_SHARED mappings of hugetlbfs.
This is chiefly useful for cases where we want to use hugepages
"automatically" - that is to map hugepages without the knowledge of
the code in the final application (either via kernel hooks, or with
LD_PRELOAD). We can use various heuristics to determine when
hugepages might be a good idea, but changing the semantics of
anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
knowledge is clearly wrong.
Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/fs/hugetlbfs/inode.c 2005-11-08 14:05:04.000000000 +1100
@@ -100,9 +100,6 @@
loff_t len, vma_len;
int ret;
- if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
- return -EINVAL;
-
if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
return -EINVAL;
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/include/linux/hugetlb.h 2005-11-08 14:05:04.000000000 +1100
@@ -65,6 +65,17 @@
pte_t *ptep);
#endif
+#define huge_ptep_set_wrprotect(mm, addr, ptep) ptep_set_wrprotect(mm, addr, ptep)
+static inline void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+}
+
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
#define hugetlb_prefault_arch_hook(mm) do { } while (0)
#else
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-11-08 14:05:04.000000000 +1100
+++ working-2.6/mm/hugetlb.c 2005-11-08 14:21:13.000000000 +1100
@@ -255,11 +255,12 @@
.nopage = hugetlb_nopage,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;
- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,6 +278,9 @@
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -288,6 +292,8 @@
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -340,7 +346,7 @@
}
static struct page *find_or_alloc_huge_page(struct address_space *mapping,
- unsigned long idx)
+ unsigned long idx, int shared)
{
struct page *page;
int err;
@@ -359,26 +365,81 @@
return NULL;
}
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ if (shared) {
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
+ } else {
+ /* Caller expects a locked page */
+ lock_page(page);
}
return page;
}
-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
+{
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
+
+ old_page = pte_page(pte);
+
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
+
+ page_cache_get(old_page);
+ new_page = alloc_huge_page();
+
+ if (! new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ int write_access)
{
int ret;
unsigned long idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
+ pte_t new_pte;
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -386,10 +447,13 @@
/* This returns a locked page, which keeps us safe in the
* event of a race with truncate() */
- page = find_or_alloc_huge_page(mapping, idx);
+ page = find_or_alloc_huge_page(mapping, idx,
+ vma->vm_flags & VM_SHARED);
if (!page)
return VM_FAULT_SIGBUS;
+ BUG_ON(!PageLocked(page));
+
spin_lock(&mm->page_table_lock);
ret = VM_FAULT_SIGBUS;
@@ -405,7 +469,16 @@
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }
spin_unlock(&mm->page_table_lock);
unlock_page(page);
@@ -426,6 +499,7 @@
{
pte_t *ptep;
pte_t entry;
+ int ret;
ptep = huge_pte_alloc(mm, address);
if (! ptep)
@@ -434,12 +508,20 @@
entry = *ptep;
if (pte_none(entry))
- return hugetlb_no_page(mm, vma, address, ptep);
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);
- /* we could get here if another thread instantiated the pte
- * before the test above */
+ ret = VM_FAULT_MINOR;
- return VM_FAULT_MINOR;
+ spin_lock(&mm->page_table_lock);
+
+ if (likely(pte_same(entry, *ptep)))
+ /* pte could have changed before we grabbed the lock */
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
Index: working-2.6/mm/mmap.c
===================================================================
--- working-2.6.orig/mm/mmap.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/mm/mmap.c 2005-11-08 14:05:04.000000000 +1100
@@ -1076,7 +1076,7 @@
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED | VM_HUGETLB))
== (VM_WRITE | VM_RESERVED)) {
printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
"PROT_WRITE mmap of VM_RESERVED memory, which "
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
WARNING: multiple messages have this Message-ID (diff)
From: David Gibson <david@gibson.dropbear.id.au>
To: Adam Litke <agl@us.ibm.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
hugh@veritas.com, rohit.seth@intel.com, "Chen,
Kenneth W" <kenneth.w.chen@intel.com>,
akpm@osdl.org
Subject: Re: [RFC 2/2] Hugetlb COW
Date: Tue, 8 Nov 2005 14:46:13 +1100 [thread overview]
Message-ID: <20051108034613.GD14336@localhost.localdomain> (raw)
In-Reply-To: <1131399533.25133.104.camel@localhost.localdomain>
On Mon, Nov 07, 2005 at 03:38:53PM -0600, Adam Litke wrote:
> [RFC] COW for hugepages
> (Patch originally from David Gibson <dwg@au1.ibm.com>)
Just realised there's at least one seriouss bug in the patch -
find_or_alloc_huge_page() can return a non-locked page when demand
faulting a PRIVATE mapping. That will cause the unlock_page() further
down to BUG(). Here's a corrected version.
This patch implements copy-on-write for hugepages, hence allowing
MAP_SHARED mappings of hugetlbfs.
This is chiefly useful for cases where we want to use hugepages
"automatically" - that is to map hugepages without the knowledge of
the code in the final application (either via kernel hooks, or with
LD_PRELOAD). We can use various heuristics to determine when
hugepages might be a good idea, but changing the semantics of
anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
knowledge is clearly wrong.
Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/fs/hugetlbfs/inode.c 2005-11-08 14:05:04.000000000 +1100
@@ -100,9 +100,6 @@
loff_t len, vma_len;
int ret;
- if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
- return -EINVAL;
-
if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
return -EINVAL;
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/include/linux/hugetlb.h 2005-11-08 14:05:04.000000000 +1100
@@ -65,6 +65,17 @@
pte_t *ptep);
#endif
+#define huge_ptep_set_wrprotect(mm, addr, ptep) ptep_set_wrprotect(mm, addr, ptep)
+static inline void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+}
+
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
#define hugetlb_prefault_arch_hook(mm) do { } while (0)
#else
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-11-08 14:05:04.000000000 +1100
+++ working-2.6/mm/hugetlb.c 2005-11-08 14:21:13.000000000 +1100
@@ -255,11 +255,12 @@
.nopage = hugetlb_nopage,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;
- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,6 +278,9 @@
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -288,6 +292,8 @@
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -340,7 +346,7 @@
}
static struct page *find_or_alloc_huge_page(struct address_space *mapping,
- unsigned long idx)
+ unsigned long idx, int shared)
{
struct page *page;
int err;
@@ -359,26 +365,81 @@
return NULL;
}
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ if (shared) {
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
+ } else {
+ /* Caller expects a locked page */
+ lock_page(page);
}
return page;
}
-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
+{
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
+
+ old_page = pte_page(pte);
+
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
+
+ page_cache_get(old_page);
+ new_page = alloc_huge_page();
+
+ if (! new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ int write_access)
{
int ret;
unsigned long idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
+ pte_t new_pte;
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -386,10 +447,13 @@
/* This returns a locked page, which keeps us safe in the
* event of a race with truncate() */
- page = find_or_alloc_huge_page(mapping, idx);
+ page = find_or_alloc_huge_page(mapping, idx,
+ vma->vm_flags & VM_SHARED);
if (!page)
return VM_FAULT_SIGBUS;
+ BUG_ON(!PageLocked(page));
+
spin_lock(&mm->page_table_lock);
ret = VM_FAULT_SIGBUS;
@@ -405,7 +469,16 @@
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }
spin_unlock(&mm->page_table_lock);
unlock_page(page);
@@ -426,6 +499,7 @@
{
pte_t *ptep;
pte_t entry;
+ int ret;
ptep = huge_pte_alloc(mm, address);
if (! ptep)
@@ -434,12 +508,20 @@
entry = *ptep;
if (pte_none(entry))
- return hugetlb_no_page(mm, vma, address, ptep);
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);
- /* we could get here if another thread instantiated the pte
- * before the test above */
+ ret = VM_FAULT_MINOR;
- return VM_FAULT_MINOR;
+ spin_lock(&mm->page_table_lock);
+
+ if (likely(pte_same(entry, *ptep)))
+ /* pte could have changed before we grabbed the lock */
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
Index: working-2.6/mm/mmap.c
===================================================================
--- working-2.6.orig/mm/mmap.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/mm/mmap.c 2005-11-08 14:05:04.000000000 +1100
@@ -1076,7 +1076,7 @@
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED | VM_HUGETLB))
== (VM_WRITE | VM_RESERVED)) {
printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
"PROT_WRITE mmap of VM_RESERVED memory, which "
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2005-11-08 3:46 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-11-07 21:10 [RFC 0/2] Copy on write for hugetlbfs Adam Litke
2005-11-07 21:10 ` Adam Litke
2005-11-07 21:38 ` [RFC 1/2] Hugetlb fault fixes and reorg Adam Litke
2005-11-07 21:38 ` Adam Litke
2005-11-07 23:30 ` William Lee Irwin III
2005-11-07 23:30 ` William Lee Irwin III
2005-11-08 1:21 ` David Gibson
2005-11-08 1:21 ` David Gibson
2005-11-07 21:38 ` [RFC 2/2] Hugetlb COW Adam Litke
2005-11-07 21:38 ` Adam Litke
2005-11-07 21:47 ` Adam Litke
2005-11-07 21:47 ` Adam Litke
2005-11-08 3:44 ` David Gibson
2005-11-08 3:44 ` David Gibson
2005-11-08 4:17 ` David Gibson
2005-11-08 4:17 ` David Gibson
2005-11-07 23:35 ` William Lee Irwin III
2005-11-07 23:35 ` William Lee Irwin III
2005-11-08 2:53 ` David Gibson
2005-11-08 2:53 ` David Gibson
2005-11-08 3:46 ` David Gibson [this message]
2005-11-08 3:46 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20051108034613.GD14336@localhost.localdomain \
--to=david@gibson.dropbear.id.au \
--cc=agl@us.ibm.com \
--cc=akpm@osdl.org \
--cc=hugh@veritas.com \
--cc=kenneth.w.chen@intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=rohit.seth@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.