From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
To: linux-mm@kvack.org
Cc: Alexis Bruemmer <alexisb@us.ibm.com>,
Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH/RFC 4/9] mm: RCU vma lookups
Date: Mon, 22 Oct 2007 16:15:23 +0530
Message-ID: <20071022104530.912224119@linux.vnet.ibm.com>
In-Reply-To: <20071022104518.985992030@linux.vnet.ibm.com>
[-- Attachment #1: 4_mm-rcu.patch --]
[-- Type: text/plain, Size: 12516 bytes --]
Mostly lockless vma lookup using the new B+tree: lookups walk the tree
under rcu_read_lock() and pin the vma they find by taking its new
per-vma lock (vm_sem) for reading; modifiers take that lock for
writing, which forces concurrent lookups into the mmap_sem slow path.
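
(Illustration, not part of the patch: roughly how a fault path would
consume the new interface. The real callers are wired up in the
i386/x86_64/ppc64 patches later in this series; fault_one_page() is a
made-up name, the handle_mm_fault() signature is the 2.6.23 one, and
VM_GROWSDOWN stack expansion is omitted.)

static int fault_one_page(struct mm_struct *mm, unsigned long address,
			  int write_access)
{
	struct vm_area_struct *vma;
	int locked = 0;			/* set on fallback to mmap_sem */
	int ret = VM_FAULT_SIGBUS;

	vma = __find_get_vma(mm, address, &locked);
	if (vma && vma->vm_start <= address)
		ret = handle_mm_fault(mm, vma, address, write_access);

	put_vma(vma);			/* drops vma->vm_sem, NULL-safe */
	if (unlikely(locked))
		up_read(&mm->mmap_sem);	/* slow path also took mmap_sem */

	return ret;
}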
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
 include/linux/init_task.h |    2 
 include/linux/mm.h        |    9 +
 kernel/fork.c             |    2 
 mm/mmap.c                 |  216 +++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 206 insertions(+), 23 deletions(-)
--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -48,7 +48,7 @@
#define INIT_MM(name) \
{ \
.mm_vmas = LIST_HEAD_INIT(name.mm_vmas), \
- .mm_btree = BTREE_INIT(GFP_ATOMIC), \
+ .mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush), \
.pgd = swapper_pg_dir, \
.mm_users = ATOMIC_INIT(2), \
.mm_count = ATOMIC_INIT(1), \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -104,12 +104,14 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+ struct rw_semaphore vm_sem;
#ifndef CONFIG_MMU
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct rcu_head vm_rcu_head;
};
static inline struct vm_area_struct *
@@ -1078,6 +1080,8 @@ static inline void vma_nonlinear_insert(
}
/* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1177,6 +1181,11 @@ extern struct vm_area_struct * find_vma(
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
+extern struct vm_area_struct * __find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked);
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void get_vma(struct vm_area_struct *vma);
+extern void put_vma(struct vm_area_struct *vma);
+
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -327,7 +327,7 @@ static void free_mm(struct mm_struct *mm
static struct mm_struct * mm_init(struct mm_struct * mm)
{
INIT_LIST_HEAD(&mm->mm_vmas);
- mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
+ mm->mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -40,6 +40,18 @@ static void unmap_region(struct mm_struc
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+ struct btree_freevec *freevec =
+ container_of(head, struct btree_freevec, rcu_head);
+ btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+ call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
@@ -218,6 +230,28 @@ void unlink_file_vma(struct vm_area_stru
}
}
+static void __free_vma(struct rcu_head *head)
+{
+ struct vm_area_struct *vma =
+ container_of(head, struct vm_area_struct, vm_rcu_head);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+ call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
+static void lock_vma(struct vm_area_struct *vma)
+{
+ down_write(&vma->vm_sem);
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+ up_write(&vma->vm_sem);
+}
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -230,7 +264,7 @@ static void remove_vma(struct vm_area_st
if (vma->vm_file)
fput(vma->vm_file);
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
return;
}
@@ -288,8 +322,15 @@ void validate_mm(struct mm_struct *mm)
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- list_for_each_entry(vma, &mm->mm_vmas, vm_list)
+ unsigned long addr = 0UL;
+ list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
+ if (addr > vma->vm_start) {
+ printk("vma list unordered: %lu %lu\n",
+ addr, vma->vm_start);
+ }
+ addr = vma->vm_start;
i++;
+ }
if (i != mm->map_count)
printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
BUG_ON(bug);
@@ -328,6 +369,7 @@ __vma_link_list(struct mm_struct *mm, st
void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
{
int err;
+ init_rwsem(&vma->vm_sem);
err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
BUG_ON(err);
}
@@ -421,8 +463,8 @@ __vma_unlink(struct mm_struct *mm, struc
BUG();
}
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+ if (rcu_dereference(mm->mmap_cache) == vma)
+ rcu_assign_pointer(mm->mmap_cache, prev);
}
/*
@@ -438,6 +480,7 @@ void vma_adjust(struct vm_area_struct *v
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma_next(vma);
struct vm_area_struct *importer = NULL;
+ struct vm_area_struct *locked = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
struct file *file = vma->vm_file;
@@ -448,6 +491,14 @@ void vma_adjust(struct vm_area_struct *v
if (next && !insert) {
if (end >= next->vm_end) {
/*
+ * We need to lock the vma to force the lockless
+ * lookup into the slow path, because there is a
+ * window between removing the next vma and updating
+ * the current one.
+ */
+ lock_vma(vma);
+ locked = vma;
+ /*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
*/
@@ -475,6 +526,25 @@ again: remove_next = 1 + (end > next->
}
}
+ if (insert) {
+ /*
+ * In order to make the adjust + insert look atomic wrt. the
+ * lockless lookups we need to force those into the slow path.
+ */
+ if (insert->vm_start < start) {
+ /*
+ * If the new vma is to be placed in front of the
+ * current one, we must lock the previous.
+ */
+ locked = vma_prev(vma);
+ if (!locked)
+ locked = vma;
+ } else
+ locked = vma;
+
+ lock_vma(locked);
+ }
+
btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
if (file) {
@@ -521,6 +591,23 @@ again: remove_next = 1 + (end > next->
}
}
+ /*
+ * Remove the next vma before updating the address of the current,
+ * because the current vma might end up with next's old address.
+ */
+ if (remove_next) {
+ /*
+ * vma_merge has merged next into vma, and needs
+ * us to remove next before dropping the locks.
+ */
+ lock_vma(next);
+ __vma_unlink(mm, next, vma);
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+ if (next->anon_vma)
+ __anon_vma_merge(vma, next);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -547,17 +634,11 @@ again: remove_next = 1 + (end > next->
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
- } else if (insert) {
+ /*
+ * Insert only after updating the address of the current vma, because
+ * the new vma might take over the current one's old start address.
+ */
+ if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
@@ -576,7 +657,7 @@ again: remove_next = 1 + (end > next->
fput(file);
mm->map_count--;
mpol_free(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ free_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -588,6 +669,13 @@ again: remove_next = 1 + (end > next->
}
}
+ if (locked) {
+ /*
+ * Unlock the vma, re-enabling lockless lookups.
+ */
+ unlock_vma(locked);
+ }
+
validate_mm(mm);
}
@@ -1142,7 +1230,7 @@ munmap_back:
fput(file);
}
mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
}
out:
mm->total_vm += len >> PAGE_SHIFT;
@@ -1166,7 +1254,7 @@ unmap_and_free_vma:
unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
- kmem_cache_free(vm_area_cachep, vma);
+ free_vma(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -1390,13 +1478,13 @@ struct vm_area_struct * find_vma(struct
if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
+ vma = rcu_dereference(mm->mmap_cache);
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
vma = btree_stab(&mm->mm_btree, addr);
if (!vma || addr >= vma->vm_end)
vma = __vma_next(&mm->mm_vmas, vma);
if (vma)
- mm->mmap_cache = vma;
+ rcu_assign_pointer(mm->mmap_cache, vma);
}
}
return vma;
@@ -1404,6 +1492,90 @@ struct vm_area_struct * find_vma(struct
EXPORT_SYMBOL(find_vma);
+/*
+ * Differs only from the above in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL, *next;
+
+ if (mm) {
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = rcu_dereference(mm->mmap_cache);
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ vma = btree_stab_next(&mm->mm_btree, addr,
+ (void **)&next);
+ if (!vma || addr >= vma->vm_end)
+ vma = next;
+
+ if (vma)
+ rcu_assign_pointer(mm->mmap_cache, vma);
+ }
+ }
+ return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order for vma modifications to appear atomic to lockless lookups,
+ * we must sometimes still fall back to mmap_sem. We do this whenever
+ * we fail to get the vma's read lock.
+ */
+struct vm_area_struct *
+__find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked)
+{
+ struct vm_area_struct *vma;
+
+ if (!mm)
+ return NULL;
+
+ rcu_read_lock();
+ vma = find_vma_rcu(mm, addr);
+ if (!vma || !down_read_trylock(&vma->vm_sem))
+ goto slow;
+ rcu_read_unlock();
+ return vma;
+
+slow:
+ rcu_read_unlock();
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+ if (vma)
+ down_read(&vma->vm_sem);
+ *locked = 1;
+ return vma;
+}
+
+struct vm_area_struct *
+find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ int locked = 0;
+ struct vm_area_struct *vma;
+
+ vma = __find_get_vma(mm, addr, &locked);
+ if (unlikely(locked))
+ up_read(&mm->mmap_sem);
+ return vma;
+}
+
+void get_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ down_read(&vma->vm_sem);
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+ if (likely(vma))
+ up_read(&vma->vm_sem);
+}
+
/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1687,8 +1859,10 @@ detach_vmas_to_be_unmapped(struct mm_str
do {
struct vm_area_struct *next = vma_next(vma);
+ lock_vma(vma);
__vma_unlink(mm, vma, NULL);
mm->map_count--;
+ unlock_vma(vma);
list_add_tail(&vma->vm_list, vmas);
vma = next;
} while (vma && vma->vm_start < end);
@@ -1697,7 +1871,7 @@ detach_vmas_to_be_unmapped(struct mm_str
else
addr = vma ? vma->vm_start : mm->mmap_base;
mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+ /* mm->mmap_cache = NULL;*/ /* Kill the cache. */
}
/*
--
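
(Editorial aside, not code from this patch: the lifetime rules above
distil to the following minimal pattern. The ex_* names are
hypothetical and the slow-path locking of the containing structure is
elided. Note how, exactly like `next' in vma_adjust(), a retired
object is freed with its write lock still held, so a late fast-path
reader can never successfully pin it and must retry via the locked
slow path.)

struct ex_obj {
	struct rw_semaphore sem;	/* pins the object when read-held */
	struct rcu_head rcu;		/* defers the free past readers */
};

/* Fast path: RCU keeps the memory valid, the trylock pins the object. */
static struct ex_obj *ex_get(struct ex_obj **slot)
{
	struct ex_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(*slot);
	if (obj && !down_read_trylock(&obj->sem))
		obj = NULL;		/* writer active: take the slow path */
	rcu_read_unlock();
	return obj;			/* if non-NULL: pinned until ex_put() */
}

static void ex_put(struct ex_obj *obj)
{
	if (obj)
		up_read(&obj->sem);
}

static void ex_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct ex_obj, rcu));
}

/*
 * Retire: fail all future trylocks, unpublish, free after a grace
 * period. The write lock is deliberately never released.
 */
static void ex_retire(struct ex_obj *obj, struct ex_obj **slot)
{
	down_write(&obj->sem);
	rcu_assign_pointer(*slot, NULL);
	call_rcu(&obj->rcu, ex_free_rcu);
}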