Message-Id: <20071022104530.912224119@linux.vnet.ibm.com>
References: <20071022104518.985992030@linux.vnet.ibm.com>
Date: Mon, 22 Oct 2007 16:15:23 +0530
From: Vaidyanathan Srinivasan
Subject: [PATCH/RFC 4/9] mm: RCU vma lookups
Content-Disposition: inline; filename=4_mm-rcu.patch
To: linux-mm@kvack.org
Cc: Alexis Bruemmer, Peter Zijlstra

mostly lockless vma lookup using the new b+tree

pin the vma using an atomic refcount

Signed-off-by: Peter Zijlstra
Signed-off-by: Vaidyanathan Srinivasan
---
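A quick usage sketch of the find_get_vma()/put_vma() pair this patch
adds; the probe_address() helper and its -EFAULT returns are
illustrative assumptions, not part of the patch:

/* Hypothetical caller: pin the vma covering 'address', use it, drop it. */
static int probe_address(struct mm_struct *mm, unsigned long address)
{
        struct vm_area_struct *vma;

        /*
         * Takes vma->vm_sem for read; when the lockless trylock fails
         * it falls back to mmap_sem internally and drops it again
         * before returning.
         */
        vma = find_get_vma(mm, address);
        if (!vma)
                return -EFAULT;
        if (address < vma->vm_start) {
                /* nearest vma lies wholly above address */
                put_vma(vma);
                return -EFAULT;
        }

        /*
         * ... use the vma; the paths that unlink a vma take vm_sem
         * for write first, so it cannot be unlinked or freed under
         * us while we hold the read side ...
         */

        put_vma(vma);
        return 0;
}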
 include/linux/init_task.h |    2 
 include/linux/mm.h        |    9 +
 kernel/fork.c             |    2 
 mm/mmap.c                 |  216 +++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 206 insertions(+), 23 deletions(-)

--- linux-2.6.23-rc9.orig/include/linux/init_task.h
+++ linux-2.6.23-rc9/include/linux/init_task.h
@@ -48,7 +48,7 @@
 #define INIT_MM(name) \
 {                                                               \
         .mm_vmas        = LIST_HEAD_INIT(name.mm_vmas),         \
-        .mm_btree       = BTREE_INIT(GFP_ATOMIC),               \
+        .mm_btree       = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush), \
         .pgd            = swapper_pg_dir,                       \
         .mm_users       = ATOMIC_INIT(2),                       \
         .mm_count       = ATOMIC_INIT(1),                       \
--- linux-2.6.23-rc9.orig/include/linux/mm.h
+++ linux-2.6.23-rc9/include/linux/mm.h
@@ -104,12 +104,14 @@ struct vm_area_struct {
         void * vm_private_data;         /* was vm_pte (shared mem) */
         unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+        struct rw_semaphore vm_sem;
 #ifndef CONFIG_MMU
         atomic_t vm_usage;              /* refcount (VMAs shared if !MMU) */
 #endif
 #ifdef CONFIG_NUMA
         struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
 #endif
+        struct rcu_head vm_rcu_head;
 };
 
 static inline struct vm_area_struct *
@@ -1078,6 +1080,8 @@ static inline void vma_nonlinear_insert(
 }
 
 /* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
         unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1177,6 +1181,11 @@ extern struct vm_area_struct * find_vma(
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                              struct vm_area_struct **pprev);
 
+extern struct vm_area_struct * __find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked);
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void get_vma(struct vm_area_struct *vma);
+extern void put_vma(struct vm_area_struct *vma);
+
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none.  Assume start_addr < end_addr. */
 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
--- linux-2.6.23-rc9.orig/kernel/fork.c
+++ linux-2.6.23-rc9/kernel/fork.c
@@ -327,7 +327,7 @@ static void free_mm(struct mm_struct *mm
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
         INIT_LIST_HEAD(&mm->mm_vmas);
-        mm->mm_btree = BTREE_INIT(GFP_ATOMIC);
+        mm->mm_btree = BTREE_INIT_FLUSH(GFP_ATOMIC, btree_rcu_flush);
         atomic_set(&mm->mm_users, 1);
         atomic_set(&mm->mm_count, 1);
         init_rwsem(&mm->mmap_sem);
--- linux-2.6.23-rc9.orig/mm/mmap.c
+++ linux-2.6.23-rc9/mm/mmap.c
@@ -40,6 +40,18 @@ static void unmap_region(struct mm_struc
                 struct vm_area_struct *vma, struct vm_area_struct *prev,
                 unsigned long start, unsigned long end);
 
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+        struct btree_freevec *freevec =
+                container_of(head, struct btree_freevec, rcu_head);
+        btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+        call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -218,6 +230,28 @@ void unlink_file_vma(struct vm_area_stru
         }
 }
 
+static void __free_vma(struct rcu_head *head)
+{
+        struct vm_area_struct *vma =
+                container_of(head, struct vm_area_struct, vm_rcu_head);
+        kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+        call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
+static void lock_vma(struct vm_area_struct *vma)
+{
+        down_write(&vma->vm_sem);
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+        up_write(&vma->vm_sem);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -230,7 +264,7 @@ static void remove_vma(struct vm_area_st
         if (vma->vm_file)
                 fput(vma->vm_file);
         mpol_free(vma_policy(vma));
-        kmem_cache_free(vm_area_cachep, vma);
+        free_vma(vma);
         return;
 }
 
@@ -288,8 +322,15 @@ void validate_mm(struct mm_struct *mm)
         int bug = 0;
         int i = 0;
         struct vm_area_struct *vma;
-        list_for_each_entry(vma, &mm->mm_vmas, vm_list)
+        unsigned long addr = 0UL;
+        list_for_each_entry(vma, &mm->mm_vmas, vm_list) {
+                if (addr > vma->vm_start) {
+                        printk("vma list unordered: %lu %lu\n",
+                                        addr, vma->vm_start);
+                }
+                addr = vma->vm_start;
                 i++;
+        }
         if (i != mm->map_count)
                 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
         BUG_ON(bug);
@@ -328,6 +369,7 @@ __vma_link_list(struct mm_struct *mm, st
 void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
 {
         int err;
+        init_rwsem(&vma->vm_sem);
         err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
         BUG_ON(err);
 }
@@ -421,8 +463,8 @@ __vma_unlink(struct mm_struct *mm, struc
                 BUG();
         }
 
-        if (mm->mmap_cache == vma)
-                mm->mmap_cache = prev;
+        if (rcu_dereference(mm->mmap_cache) == vma)
+                rcu_assign_pointer(mm->mmap_cache, prev);
 }
 
 /*
@@ -438,6 +480,7 @@ void vma_adjust(struct vm_area_struct *v
         struct mm_struct *mm = vma->vm_mm;
         struct vm_area_struct *next = vma_next(vma);
         struct vm_area_struct *importer = NULL;
+        struct vm_area_struct *locked = NULL;
         struct address_space *mapping = NULL;
         struct prio_tree_root *root = NULL;
         struct file *file = vma->vm_file;
@@ -448,6 +491,14 @@
         if (next && !insert) {
                 if (end >= next->vm_end) {
                         /*
+                         * We need to lock the vma to force the lockless
+                         * lookup into the slow path, because there is a
+                         * hole between removing the next vma and updating
+                         * the current one.
+                         */
+                        lock_vma(vma);
+                        locked = vma;
+                        /*
                          * vma expands, overlapping all the next, and
                          * perhaps the one after too (mprotect case 6).
                          */
@@ -475,6 +526,25 @@ again:                  remove_next = 1 + (end > next->
                 }
         }
 
+        if (insert) {
+                /*
+                 * In order to make the adjust + insert look atomic wrt. the
+                 * lockless lookups we need to force those into the slow path.
+                 */
+                if (insert->vm_start < start) {
+                        /*
+                         * If the new vma is to be placed in front of the
+                         * current one, we must lock the previous.
+                         */
+                        locked = vma_prev(vma);
+                        if (!locked)
+                                locked = vma;
+                } else
+                        locked = vma;
+
+                lock_vma(locked);
+        }
+
         btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL); // XXX error path
 
         if (file) {
@@ -521,6 +591,23 @@ again:                  remove_next = 1 + (end > next->
                 }
         }
 
+        /*
+         * Remove the next vma before updating the address of the current,
+         * because it might end up having the address of next.
+         */
+        if (remove_next) {
+                /*
+                 * vma_merge has merged next into vma, and needs
+                 * us to remove next before dropping the locks.
+                 */
+                lock_vma(next);
+                __vma_unlink(mm, next, vma);
+                if (file)
+                        __remove_shared_vm_struct(next, file, mapping);
+                if (next->anon_vma)
+                        __anon_vma_merge(vma, next);
+        }
+
         if (root) {
                 flush_dcache_mmap_lock(mapping);
                 vma_prio_tree_remove(vma, root);
@@ -547,17 +634,11 @@ again:                  remove_next = 1 + (end > next->
                 flush_dcache_mmap_unlock(mapping);
         }
 
-        if (remove_next) {
-                /*
-                 * vma_merge has merged next into vma, and needs
-                 * us to remove next before dropping the locks.
-                 */
-                __vma_unlink(mm, next, vma);
-                if (file)
-                        __remove_shared_vm_struct(next, file, mapping);
-                if (next->anon_vma)
-                        __anon_vma_merge(vma, next);
-        } else if (insert) {
+        /*
+         * Insert after updating the address of the current vma, because it
+         * might end up having the previous address.
+         */
+        if (insert) {
                 /*
                  * split_vma has split insert from vma, and needs
                  * us to insert it before dropping the locks
@@ -576,7 +657,7 @@ again:                  remove_next = 1 + (end > next->
                         fput(file);
                 mm->map_count--;
                 mpol_free(vma_policy(next));
-                kmem_cache_free(vm_area_cachep, next);
+                free_vma(next);
                 /*
                  * In mprotect's case 6 (see comments on vma_merge),
                  * we must remove another next too. It would clutter
@@ -588,6 +669,13 @@ again:                  remove_next = 1 + (end > next->
                 }
         }
 
+        if (locked) {
+                /*
+                 * unlock the vma, enabling lockless lookups.
+                 */
+                unlock_vma(locked);
+        }
+
         validate_mm(mm);
 }
 
@@ -1142,7 +1230,7 @@ munmap_back:
                         fput(file);
                 }
                 mpol_free(vma_policy(vma));
-                kmem_cache_free(vm_area_cachep, vma);
+                free_vma(vma);
         }
 out:
         mm->total_vm += len >> PAGE_SHIFT;
@@ -1166,7 +1254,7 @@ unmap_and_free_vma:
         unmap_region(mm, &vmas, vma, prev, vma->vm_start, vma->vm_end);
         charged = 0;
 free_vma:
-        kmem_cache_free(vm_area_cachep, vma);
+        free_vma(vma);
 unacct_error:
         if (charged)
                 vm_unacct_memory(charged);
@@ -1390,13 +1478,13 @@ struct vm_area_struct * find_vma(struct
         if (mm) {
                 /* Check the cache first. */
                 /* (Cache hit rate is typically around 35%.) */
-                vma = mm->mmap_cache;
+                vma = rcu_dereference(mm->mmap_cache);
                 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
                         vma = btree_stab(&mm->mm_btree, addr);
                         if (!vma || addr >= vma->vm_end)
                                 vma = __vma_next(&mm->mm_vmas, vma);
                         if (vma)
-                                mm->mmap_cache = vma;
+                                rcu_assign_pointer(mm->mmap_cache, vma);
                 }
         }
         return vma;
 }
@@ -1404,6 +1492,90 @@ struct vm_area_struct * find_vma(struct
 
 EXPORT_SYMBOL(find_vma);
 
+/*
+ * Differs only from the above in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+        struct vm_area_struct *vma = NULL, *next;
+
+        if (mm) {
+                /* Check the cache first. */
+                /* (Cache hit rate is typically around 35%.) */
+                vma = rcu_dereference(mm->mmap_cache);
+                if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+                        vma = btree_stab_next(&mm->mm_btree, addr,
+                                        (void **)&next);
+                        if (!vma || addr >= vma->vm_end)
+                                vma = next;
+
+                        if (vma)
+                                rcu_assign_pointer(mm->mmap_cache, vma);
+                }
+        }
+        return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order to be able to do vma modifications and have them appear atomic
+ * we must sometimes still take the read lock. We do this when we fail to
+ * get a reference on the vma.
+ *
+ */
+struct vm_area_struct *
+__find_get_vma(struct mm_struct *mm, unsigned long addr, int *locked)
+{
+        struct vm_area_struct *vma;
+
+        if (!mm)
+                return NULL;
+
+        rcu_read_lock();
+        vma = find_vma_rcu(mm, addr);
+        if (!vma || !down_read_trylock(&vma->vm_sem))
+                goto slow;
+        rcu_read_unlock();
+        return vma;
+
+slow:
+        rcu_read_unlock();
+        down_read(&mm->mmap_sem);
+        vma = find_vma(mm, addr);
+        if (vma)
+                down_read(&vma->vm_sem);
+        *locked = 1;
+        return vma;
+}
+
+struct vm_area_struct *
+find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+        int locked = 0;
+        struct vm_area_struct *vma;
+
+        vma = __find_get_vma(mm, addr, &locked);
+        if (unlikely(locked))
+                up_read(&mm->mmap_sem);
+        return vma;
+}
+
+void get_vma(struct vm_area_struct *vma)
+{
+        if (likely(vma))
+                down_read(&vma->vm_sem);
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+        if (likely(vma))
+                up_read(&vma->vm_sem);
+}
+
 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1687,8 +1859,10 @@ detach_vmas_to_be_unmapped(struct mm_str
         do {
                 struct vm_area_struct *next = vma_next(vma);
 
+                lock_vma(vma);
                 __vma_unlink(mm, vma, NULL);
                 mm->map_count--;
+                unlock_vma(vma);
                 list_add_tail(&vma->vm_list, vmas);
                 vma = next;
         } while (vma && vma->vm_start < end);
@@ -1697,7 +1871,7 @@ detach_vmas_to_be_unmapped(struct mm_str
         else
                 addr = vma ? vma->vm_start : mm->mmap_base;
         mm->unmap_area(mm, addr);
-        mm->mmap_cache = NULL;          /* Kill the cache. */
+        /* mm->mmap_cache = NULL; */    /* Kill the cache. */
 }
 
 /*
--
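The ordering vma_adjust() enforces for lockless lookups, compressed
into a reading aid under simplifying assumptions (remove_next case
only; file, anon_vma, btree-preload and list-manipulation details
omitted); this is a sketch, not code from the patch:

        /*
         * Write-locking the vma first knocks lockless readers off the
         * fast path: their down_read_trylock(&vma->vm_sem) fails and
         * they retry under mmap_sem.
         */
        lock_vma(vma);

        /*
         * Unlink next BEFORE growing vma over its range: the adjusted
         * vma may end up with next's address, and the btree must never
         * hold two vmas for one address.  next stays write-locked until
         * freed, so stragglers also fall back to the slow path.
         */
        lock_vma(next);
        __vma_unlink(mm, next, vma);

        /*
         * The hole where next's range is in neither list nor btree is
         * covered by the write lock taken above.
         */
        vma->vm_end = next->vm_end;

        unlock_vma(vma);        /* lockless lookups are valid again */
        free_vma(next);         /* actual kmem_cache_free() waits for RCU */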
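free_vma() and btree_rcu_flush() above are both instances of the
standard call_rcu() deferred-free shape, reduced here to its essentials
(struct foo and its fields are stand-ins, not from the patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int payload;
        struct rcu_head rcu_head;       /* embedded so container_of() can
                                         * recover the object */
};

static void __free_foo(struct rcu_head *head)
{
        struct foo *f = container_of(head, struct foo, rcu_head);

        /* Runs only after every RCU read-side critical section that
         * could still see 'f' has ended. */
        kfree(f);
}

static void free_foo(struct foo *f)
{
        /* Queue the actual free until after a grace period. */
        call_rcu(&f->rcu_head, __free_foo);
}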