- * [PATCH 01/10] mm,fs: introduce helpers around the i_mmap_mutex
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 02/10] mm: use new helper functions " Davidlohr Bueso
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
Various parts of the kernel acquire and release this mutex,
so add i_mmap_lock_write() and immap_unlock_write() helper
functions that will encapsulate this logic. The next patch
will make use of these.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 include/linux/fs.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a957d43..042d5c6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,6 +454,16 @@ struct block_device {
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
+static inline void i_mmap_lock_write(struct address_space *mapping)
+{
+	mutex_lock(&mapping->i_mmap_mutex);
+}
+
+static inline void i_mmap_unlock_write(struct address_space *mapping)
+{
+	mutex_unlock(&mapping->i_mmap_mutex);
+}
+
 /*
  * Might pages of this file be mapped into userspace?
  */
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 02/10] mm: use new helper functions around the i_mmap_mutex
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 01/10] mm,fs: introduce helpers around the i_mmap_mutex Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 03/10] mm: convert i_mmap_mutex to rwsem Davidlohr Bueso
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
Convert all open coded mutex_lock/unlock calls to the
i_mmap_[lock/unlock]_write() helpers.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
 fs/hugetlbfs/inode.c    |  4 ++--
 kernel/events/uprobes.c |  4 ++--
 kernel/fork.c           |  4 ++--
 mm/filemap_xip.c        |  4 ++--
 mm/hugetlb.c            | 12 ++++++------
 mm/memory-failure.c     |  4 ++--
 mm/memory.c             |  8 ++++----
 mm/mmap.c               | 14 +++++++-------
 mm/mremap.c             |  4 ++--
 mm/nommu.c              | 14 +++++++-------
 mm/rmap.c               |  4 ++--
 11 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1e2872b..a082709 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a..d246a0a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -724,7 +724,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 	int more = 0;
 
  again:
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
 			continue;
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 		info->mm = vma->vm_mm;
 		info->vaddr = offset_to_vaddr(vma, offset);
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 
 	if (!more)
 		goto out;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746..915e8da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
-			mutex_lock(&mapping->i_mmap_mutex);
+			i_mmap_lock_write(mapping);
 			if (tmp->vm_flags & VM_SHARED)
 				atomic_inc(&mapping->i_mmap_writable);
 			flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				vma_interval_tree_insert_after(tmp, mpnt,
 							&mapping->i_mmap);
 			flush_dcache_mmap_unlock(mapping);
-			mutex_unlock(&mapping->i_mmap_mutex);
+			i_mmap_unlock_write(mapping);
 		}
 
 		/*
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..bad746b 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -182,7 +182,7 @@ __xip_unmap (struct address_space * mapping,
 		return;
 
 retry:
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
@@ -202,7 +202,7 @@ retry:
 			page_cache_release(page);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 
 	if (locked) {
 		mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd7227..fa86464 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2772,7 +2772,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2789,7 +2789,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_hugepage_range(iter_vma, address,
 					     address + huge_page_size(h), page);
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 
 /*
@@ -3346,7 +3346,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
@@ -3374,7 +3374,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 
 	return pages << h->order;
@@ -3542,7 +3542,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -3570,7 +3570,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8639f6b..13708cf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -466,7 +466,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +488,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 1cc6bfb..d16c662 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1339,9 +1339,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
-				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
-				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_unlock_write(vma->vm_file->f_mapping);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
@@ -2390,12 +2390,12 @@ void unmap_mapping_range(struct address_space *mapping,
 		details.last_index = ULONG_MAX;
 
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f85520..ac4d05a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		__remove_shared_vm_struct(vma, file, mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 }
 
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 	}
 
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	__vma_link_file(vma);
 
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -793,7 +793,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 							next->vm_end);
 		}
 
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		if (insert) {
 			/*
 			 * Put into interval tree now, so instantiated pages
@@ -880,7 +880,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		anon_vma_unlock_write(anon_vma);
 	}
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	if (root) {
 		uprobe_mmap(vma);
@@ -3176,7 +3176,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
 		 * AS_MM_ALL_LOCKS can't change to 0 from under us
 		 * because we hold the mm_all_locks_mutex.
 		 */
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
 					&mapping->flags))
 			BUG();
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66..426b448 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (need_rmap_locks) {
 		if (vma->vm_file) {
 			mapping = vma->vm_file->f_mapping;
-			mutex_lock(&mapping->i_mmap_mutex);
+			i_mmap_lock_write(mapping);
 		}
 		if (vma->anon_vma) {
 			anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (anon_vma)
 		anon_vma_unlock_write(anon_vma);
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e..52a5765 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	/* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	/* remove from the MM's tree and list */
@@ -2094,14 +2094,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	down_write(&nommu_region_sem);
-	mutex_lock(&inode->i_mapping->i_mmap_mutex);
+	i_mmap_lock_write(inode->i_mapping);
 
 	/* search for VMAs that fall within the dead zone */
 	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
 		/* found one - only interested if it's shared out of the page
 		 * cache */
 		if (vma->vm_flags & VM_SHARED) {
-			mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+			i_mmap_unlock_write(inode->i_mapping);
 			up_write(&nommu_region_sem);
 			return -ETXTBSY; /* not quite true, but near enough */
 		}
@@ -2129,7 +2129,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 		}
 	}
 
-	mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+	i_mmap_unlock_write(inode->i_mapping);
 	up_write(&nommu_region_sem);
 	return 0;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a505..46fe32a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1674,7 +1674,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 
 	if (!mapping)
 		return ret;
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 
@@ -1697,7 +1697,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 	ret = rwc->file_nonlinear(page, mapping, rwc->arg);
 
 done:
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	return ret;
 }
 
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 03/10] mm: convert i_mmap_mutex to rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 01/10] mm,fs: introduce helpers around the i_mmap_mutex Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 02/10] mm: use new helper functions " Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:45   ` Kirill A. Shutemov
  2014-10-24 22:06 ` [PATCH 04/10] mm/rmap: share the i_mmap_rwsem Davidlohr Bueso
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
The i_mmap_mutex is a close cousin of the anon vma lock,
both protecting similar data, one for file backed pages
and the other for anon memory. To this end, this lock can
also be a rwsem. In addition, there are some important
opportunities to share the lock when there are no tree
modifications.
This conversion is straightforward. For now, all users take
the write lock.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
---
 fs/hugetlbfs/inode.c         | 10 +++++-----
 fs/inode.c                   |  2 +-
 include/linux/fs.h           |  7 ++++---
 include/linux/mmu_notifier.h |  2 +-
 kernel/events/uprobes.c      |  2 +-
 mm/filemap.c                 | 10 +++++-----
 mm/fremap.c                  |  4 ++--
 mm/hugetlb.c                 | 10 +++++-----
 mm/mmap.c                    |  8 ++++----
 mm/mremap.c                  |  2 +-
 mm/rmap.c                    |  6 +++---
 11 files changed, 32 insertions(+), 31 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a082709..5eba47f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 }
 
 /*
- * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
  * be taken from reclaim -- unlike regular filesystems. This needs an
  * annotation because huge_pmd_share() does an allocation under
- * i_mmap_mutex.
+ * i_mmap_rwsem.
  */
-static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 					struct inode *dir,
@@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info;
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
-		lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
-				&hugetlbfs_i_mmap_mutex_key);
+		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 26753ba..1732d6b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -349,7 +349,7 @@ void address_space_init_once(struct address_space *mapping)
 	memset(mapping, 0, sizeof(*mapping));
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
-	mutex_init(&mapping->i_mmap_mutex);
+	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 042d5c6..b183792 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -18,6 +18,7 @@
 #include <linux/pid.h>
 #include <linux/bug.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
@@ -388,7 +389,7 @@ struct address_space {
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	unsigned long		nrshadows;	/* number of shadow entries */
@@ -456,12 +457,12 @@ int mapping_tagged(struct address_space *mapping, int tag);
 
 static inline void i_mmap_lock_write(struct address_space *mapping)
 {
-	mutex_lock(&mapping->i_mmap_mutex);
+	down_write(&mapping->i_mmap_rwsem);
 }
 
 static inline void i_mmap_unlock_write(struct address_space *mapping)
 {
-	mutex_unlock(&mapping->i_mmap_mutex);
+	up_write(&mapping->i_mmap_rwsem);
 }
 
 /*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 88787bb..ab8564b 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -154,7 +154,7 @@ struct mmu_notifier_ops {
  * Therefore notifier chains can only be traversed when either
  *
  * 1. mmap_sem is held.
- * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem).
+ * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d246a0a..045b649 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -731,7 +731,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
 		if (!prev && !more) {
 			/*
-			 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 			 * reclaim. This is optimistic, no harm done if it fails.
 			 */
 			prev = kmalloc(sizeof(struct map_info),
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642..e8905bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -85,7 +85,7 @@
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
@@ -105,7 +105,7 @@
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa3..11ef7ec 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
 			}
 			goto out_freed;
 		}
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fa86464..7eeab54 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2724,9 +2724,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out.  We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -3368,9 +3368,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
@@ -3523,7 +3523,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
  * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  * bad pmd for sharing.
  */
diff --git a/mm/mmap.c b/mm/mmap.c
index ac4d05a..06c5b2b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -2785,7 +2785,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
  */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3080,7 +3080,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 		 */
 		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
 			BUG();
-		mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
 	}
 }
 
@@ -3107,7 +3107,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * vma in this mm is backed by the same anon_vma or address_space.
  *
  * We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
  * takes more than one of them in a row. Secondly we're protected
  * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
  *
diff --git a/mm/mremap.c b/mm/mremap.c
index 426b448..84aa36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	spinlock_t *old_ptl, *new_ptl;
 
 	/*
-	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
 	 * locks to ensure that rmap will always observe either the old or the
 	 * new ptes. This is the easiest way to avoid races with
 	 * truncate_pagecache(), page migration, etc...
diff --git a/mm/rmap.c b/mm/rmap.c
index 46fe32a..f234bc6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_mutex
+ *       mapping->i_mmap_rwsem
  *         anon_vma->rwsem
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1244,7 +1244,7 @@ out_mlock:
 	/*
 	 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
 	 * unstable result and race. Plus, We can't wait here because
-	 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
+	 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
 	 * if trylock failed, the page remain in evictable lru and later
 	 * vmscan could retry to move the page to unevictable lru if the
 	 * page is actually mlocked.
@@ -1668,7 +1668,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_mutex.
+	 * so we can safely take mapping->i_mmap_rwsem.
 	 */
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * Re: [PATCH 03/10] mm: convert i_mmap_mutex to rwsem
  2014-10-24 22:06 ` [PATCH 03/10] mm: convert i_mmap_mutex to rwsem Davidlohr Bueso
@ 2014-10-24 22:45   ` Kirill A. Shutemov
  2014-10-28 20:27     ` Davidlohr Bueso
  0 siblings, 1 reply; 17+ messages in thread
From: Kirill A. Shutemov @ 2014-10-24 22:45 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: akpm, hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm
On Fri, Oct 24, 2014 at 03:06:13PM -0700, Davidlohr Bueso wrote:
> diff --git a/mm/fremap.c b/mm/fremap.c
> index 72b8fa3..11ef7ec 100644
> --- a/mm/fremap.c
> +++ b/mm/fremap.c
> @@ -238,13 +238,13 @@ get_write_lock:
>  			}
>  			goto out_freed;
>  		}
> -		mutex_lock(&mapping->i_mmap_mutex);
> +		i_mmap_lock_write(mapping);
>  		flush_dcache_mmap_lock(mapping);
>  		vma->vm_flags |= VM_NONLINEAR;
>  		vma_interval_tree_remove(vma, &mapping->i_mmap);
>  		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
>  		flush_dcache_mmap_unlock(mapping);
> -		mutex_unlock(&mapping->i_mmap_mutex);
> +		i_mmap_unlock_write(mapping);
>  	}
>  
>  	if (vma->vm_flags & VM_LOCKED) {
This should go to previous patch.
-- 
 Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH 03/10] mm: convert i_mmap_mutex to rwsem
  2014-10-24 22:45   ` Kirill A. Shutemov
@ 2014-10-28 20:27     ` Davidlohr Bueso
  0 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-28 20:27 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, hughd, riel, mgorman, peterz, mingo, linux-kernel, linux-mm
On Sat, 2014-10-25 at 01:45 +0300, Kirill A. Shutemov wrote:
> On Fri, Oct 24, 2014 at 03:06:13PM -0700, Davidlohr Bueso wrote:
> > diff --git a/mm/fremap.c b/mm/fremap.c
> > index 72b8fa3..11ef7ec 100644
> > --- a/mm/fremap.c
> > +++ b/mm/fremap.c
> > @@ -238,13 +238,13 @@ get_write_lock:
> >  			}
> >  			goto out_freed;
> >  		}
> > -		mutex_lock(&mapping->i_mmap_mutex);
> > +		i_mmap_lock_write(mapping);
> >  		flush_dcache_mmap_lock(mapping);
> >  		vma->vm_flags |= VM_NONLINEAR;
> >  		vma_interval_tree_remove(vma, &mapping->i_mmap);
> >  		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
> >  		flush_dcache_mmap_unlock(mapping);
> > -		mutex_unlock(&mapping->i_mmap_mutex);
> > +		i_mmap_unlock_write(mapping);
> >  	}
> >  
> >  	if (vma->vm_flags & VM_LOCKED) {
> 
> This should go to previous patch.
Indeed. I had forgotten I snuck that change in as when I was writing the
patch there was a conflict with that fremap. However you removed
mm/fremap.c altogether in -next (mm: replace remap_file_pages() syscall
with emulation) so I'll just update accordingly.
Thanks,
Davidlohr
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 17+ messages in thread
 
 
- * [PATCH 04/10] mm/rmap: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (2 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 03/10] mm: convert i_mmap_mutex to rwsem Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 05/10] uprobes: " Davidlohr Bueso
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
Similarly to the anon memory counterpart, we can share
the mapping's lock ownership as the interval tree is
not modified when doing doing the walk, only the file
page.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
---
 include/linux/fs.h | 10 ++++++++++
 mm/rmap.c          |  6 +++---
 2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b183792..1059be0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -465,6 +465,16 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
 	up_write(&mapping->i_mmap_rwsem);
 }
 
+static inline void i_mmap_lock_read(struct address_space *mapping)
+{
+	down_read(&mapping->i_mmap_rwsem);
+}
+
+static inline void i_mmap_unlock_read(struct address_space *mapping)
+{
+	up_read(&mapping->i_mmap_rwsem);
+}
+
 /*
  * Might pages of this file be mapped into userspace?
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index f234bc6..a77726f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1674,7 +1674,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 
 	if (!mapping)
 		return ret;
-	i_mmap_lock_write(mapping);
+
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 
@@ -1695,9 +1696,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 		goto done;
 
 	ret = rwc->file_nonlinear(page, mapping, rwc->arg);
-
 done:
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 	return ret;
 }
 
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 05/10] uprobes: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (3 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 04/10] mm/rmap: share the i_mmap_rwsem Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-27  7:03   ` Srikar Dronamraju
  2014-10-24 22:06 ` [PATCH 06/10] mm/xip: " Davidlohr Bueso
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
Both register and unregister call build_map_info() in order
to create the list of mappings before installing or removing
breakpoints for every mm which maps file backed memory. As
such, there is no reason to hold the i_mmap_rwsem exclusively,
so share it and allow concurrent readers to build the mapping
data.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 kernel/events/uprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 045b649..7a9e620 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -724,7 +724,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 	int more = 0;
 
  again:
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		if (!valid_vma(vma, is_register))
 			continue;
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 		info->mm = vma->vm_mm;
 		info->vaddr = offset_to_vaddr(vma, offset);
 	}
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 
 	if (!more)
 		goto out;
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * Re: [PATCH 05/10] uprobes: share the i_mmap_rwsem
  2014-10-24 22:06 ` [PATCH 05/10] uprobes: " Davidlohr Bueso
@ 2014-10-27  7:03   ` Srikar Dronamraju
  2014-10-28  2:02     ` Oleg Nesterov
  0 siblings, 1 reply; 17+ messages in thread
From: Srikar Dronamraju @ 2014-10-27  7:03 UTC (permalink / raw)
  To: Davidlohr Bueso, Oleg Nesterov
  Cc: akpm, hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm
* Davidlohr Bueso <dave@stgolabs.net> [2014-10-24 15:06:15]:
> Both register and unregister call build_map_info() in order
> to create the list of mappings before installing or removing
> breakpoints for every mm which maps file backed memory. As
> such, there is no reason to hold the i_mmap_rwsem exclusively,
> so share it and allow concurrent readers to build the mapping
> data.
> 
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Copying Oleg (since he should have been copied on this one)
Please see one comment below.
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> 
> ---
>  kernel/events/uprobes.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 045b649..7a9e620 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -724,7 +724,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
>  	int more = 0;
>  
>   again:
> -	i_mmap_lock_write(mapping);
> +	i_mmap_lock_read(mapping);
>  	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>  		if (!valid_vma(vma, is_register))
>  			continue;
Just after this, we have
if (!prev && !more) {
	/*
	 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
	 * reclaim. This is optimistic, no harm done if it fails.
	 */
	prev = kmalloc(sizeof(struct map_info),
			GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (prev)
		prev->next = NULL;
}
However in patch 02/10
I dont think the comment referring to i_mmap_mutex was modified to
refer i_mmap_lock_write.
When thats taken care off, this patch should change that part accordingly.
-- 
Thanks and Regards
Srikar Dronamraju
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 17+ messages in thread
- * Re: [PATCH 05/10] uprobes: share the i_mmap_rwsem
  2014-10-27  7:03   ` Srikar Dronamraju
@ 2014-10-28  2:02     ` Oleg Nesterov
  0 siblings, 0 replies; 17+ messages in thread
From: Oleg Nesterov @ 2014-10-28  2:02 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Davidlohr Bueso, akpm, hughd, riel, mgorman, peterz, mingo,
	linux-kernel, dbueso, linux-mm
On 10/27, Srikar Dronamraju wrote:
>
> Copying Oleg (since he should have been copied on this one)
Thanks ;)
> Please see one comment below.
>
> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> 
>
> > ---
> >  kernel/events/uprobes.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 045b649..7a9e620 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -724,7 +724,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
> >  	int more = 0;
> >  
> >   again:
> > -	i_mmap_lock_write(mapping);
> > +	i_mmap_lock_read(mapping);
I too think the patch is fine.
I didn't see other changes, but I hope that i_mmap_lock_write/read names
provide enough info and ->i_mmap_mutex was turned into rw-lock,  in this
case read-lock should be enough.
Oleg.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 17+ messages in thread 
 
 
- * [PATCH 06/10] mm/xip: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (4 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 05/10] uprobes: " Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 07/10] mm/memory-failure: " Davidlohr Bueso
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
__xip_unmap() will remove the xip sparse page from the cache
and take down pte mapping, without altering the interval tree,
thus share the i_mmap_rwsem when searching for the ptes to
unmap.
Additionally, tidy up the function a bit and make variables only
local to the interval tree walk loop.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 mm/filemap_xip.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index bad746b..0d105ae 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 EXPORT_SYMBOL_GPL(xip_file_read);
 
 /*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
  * __xip_sparse_page when found at pgoff.
  */
-static void
-__xip_unmap (struct address_space * mapping,
-		     unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
 {
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
-	unsigned long address;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
 	struct page *page;
 	unsigned count;
 	int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
 		return;
 
 retry:
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		mm = vma->vm_mm;
-		address = vma->vm_start +
+		pte_t *pte, pteval;
+		spinlock_t *ptl;
+		struct mm_struct *mm = vma->vm_mm;
+		unsigned long address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
@@ -202,7 +197,7 @@ retry:
 			page_cache_release(page);
 		}
 	}
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 
 	if (locked) {
 		mutex_unlock(&xip_sparse_mutex);
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 07/10] mm/memory-failure: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (5 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 06/10] mm/xip: " Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 08/10] mm/mremap: " Davidlohr Bueso
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
No brainer conversion: collect_procs_file() only schedules
a process for later kill, share the lock, similarly to
the anon vma variant.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 mm/memory-failure.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 13708cf..ce7b0ec 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -466,7 +466,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +488,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 08/10] mm/mremap: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (6 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 07/10] mm/memory-failure: " Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:06 ` [PATCH 09/10] mm/nommu: " Davidlohr Bueso
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
As per the comment in move_ptes(), we only require taking the
anon vma and i_mmap locks to ensure that rmap will always observe
either the old or new ptes, in the case of need_rmap_lock=true.
No modifications to the tree itself, thus share the i_mmap_rwsem.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
XXX: the same *should* apply for the anon vma, as there are
no modifications to the chain.
 mm/mremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/mremap.c b/mm/mremap.c
index 84aa36f..a34500c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (need_rmap_locks) {
 		if (vma->vm_file) {
 			mapping = vma->vm_file->f_mapping;
-			i_mmap_lock_write(mapping);
+			i_mmap_lock_read(mapping);
 		}
 		if (vma->anon_vma) {
 			anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	if (anon_vma)
 		anon_vma_unlock_write(anon_vma);
 	if (mapping)
-		i_mmap_unlock_write(mapping);
+		i_mmap_unlock_read(mapping);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 09/10] mm/nommu: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (7 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 08/10] mm/mremap: " Davidlohr Bueso
@ 2014-10-24 22:06 ` Davidlohr Bueso
  2014-10-24 22:16 ` [PATCH 10/10] mm/hugetlb: " Davidlohr Bueso
  2014-10-28 14:08 ` [PATCH 00/10] mm: improve usage of the i_mmap lock Kirill A. Shutemov
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:06 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm, Davidlohr Bueso
Shrinking/truncate logic can call nommu_shrink_inode_mappings()
to verify that any shared mappings of the inode in question aren't
broken (dead zone). afaict the only user being ramfs to handle
the size change attribute.
Pretty much a no-brainer to share the lock.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 mm/nommu.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/mm/nommu.c b/mm/nommu.c
index 52a5765..cd519e1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -2094,14 +2094,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	down_write(&nommu_region_sem);
-	i_mmap_lock_write(inode->i_mapping);
+	i_mmap_lock_read(inode->i_mapping);
 
 	/* search for VMAs that fall within the dead zone */
 	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
 		/* found one - only interested if it's shared out of the page
 		 * cache */
 		if (vma->vm_flags & VM_SHARED) {
-			i_mmap_unlock_write(inode->i_mapping);
+			i_mmap_unlock_read(inode->i_mapping);
 			up_write(&nommu_region_sem);
 			return -ETXTBSY; /* not quite true, but near enough */
 		}
@@ -2113,8 +2113,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	 * we don't check for any regions that start beyond the EOF as there
 	 * shouldn't be any
 	 */
-	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
-				  0, ULONG_MAX) {
+	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
 		if (!(vma->vm_flags & VM_SHARED))
 			continue;
 
@@ -2129,7 +2128,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 		}
 	}
 
-	i_mmap_unlock_write(inode->i_mapping);
+	i_mmap_unlock_read(inode->i_mapping);
 	up_write(&nommu_region_sem);
 	return 0;
 }
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * [PATCH 10/10] mm/hugetlb: share the i_mmap_rwsem
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (8 preceding siblings ...)
  2014-10-24 22:06 ` [PATCH 09/10] mm/nommu: " Davidlohr Bueso
@ 2014-10-24 22:16 ` Davidlohr Bueso
  2014-10-28 14:08 ` [PATCH 00/10] mm: improve usage of the i_mmap lock Kirill A. Shutemov
  10 siblings, 0 replies; 17+ messages in thread
From: Davidlohr Bueso @ 2014-10-24 22:16 UTC (permalink / raw)
  To: akpm
  Cc: hughd, riel, mgorman, peterz, mingo, dbueso, linux-kernel,
	linux-mm, Davidlohr Bueso
From: Davidlohr Bueso <dave@stgolabs.net>
The i_mmap_rwsem protects shared pages against races
when doing the sharing and unsharing, ultimately
calling huge_pmd_share/unshare() for PMD pages --
it also needs it to avoid races when populating the pud
for pmd allocation when looking for a shareable pmd page
for hugetlb. Ultimately the interval tree remains intact.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Resending this patch due to stupid email quota rules, *sigh*
 fs/hugetlbfs/inode.c |  4 ++--
 mm/hugetlb.c         | 12 ++++++------
 mm/memory.c          |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f..0dca54d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7eeab54..f68dd21 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2772,7 +2772,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2789,7 +2789,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_hugepage_range(iter_vma, address,
 					     address + huge_page_size(h), page);
 	}
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
@@ -3346,7 +3346,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	i_mmap_lock_read(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
@@ -3374,7 +3374,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	i_mmap_unlock_read(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 
 	return pages << h->order;
@@ -3542,7 +3542,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -3570,7 +3570,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 	return pte;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index d16c662..b1931c1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1339,9 +1339,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
-				i_mmap_lock_write(vma->vm_file->f_mapping);
+				i_mmap_lock_read(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
-				i_mmap_unlock_write(vma->vm_file->f_mapping);
+				i_mmap_unlock_read(vma->vm_file->f_mapping);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
-- 
1.8.4.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 17+ messages in thread
- * Re: [PATCH 00/10] mm: improve usage of the i_mmap lock
  2014-10-24 22:06 [PATCH 00/10] mm: improve usage of the i_mmap lock Davidlohr Bueso
                   ` (9 preceding siblings ...)
  2014-10-24 22:16 ` [PATCH 10/10] mm/hugetlb: " Davidlohr Bueso
@ 2014-10-28 14:08 ` Kirill A. Shutemov
  10 siblings, 0 replies; 17+ messages in thread
From: Kirill A. Shutemov @ 2014-10-28 14:08 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: akpm, hughd, riel, mgorman, peterz, mingo, linux-kernel, dbueso,
	linux-mm
On Fri, Oct 24, 2014 at 03:06:10PM -0700, Davidlohr Bueso wrote:
> Davidlohr Bueso (10):
>   mm,fs: introduce helpers around the i_mmap_mutex
>   mm: use new helper functions around the i_mmap_mutex
>   mm: convert i_mmap_mutex to rwsem
>   mm/rmap: share the i_mmap_rwsem
>   uprobes: share the i_mmap_rwsem
>   mm/xip: share the i_mmap_rwsem
>   mm/memory-failure: share the i_mmap_rwsem
>   mm/mremap: share the i_mmap_rwsem
>   mm/nommu: share the i_mmap_rwsem
>   mm/hugetlb: share the i_mmap_rwsem
> 
>  fs/hugetlbfs/inode.c         | 14 +++++++-------
>  fs/inode.c                   |  2 +-
>  include/linux/fs.h           | 23 ++++++++++++++++++++++-
>  include/linux/mmu_notifier.h |  2 +-
>  kernel/events/uprobes.c      |  6 +++---
>  kernel/fork.c                |  4 ++--
>  mm/filemap.c                 | 10 +++++-----
>  mm/filemap_xip.c             | 23 +++++++++--------------
>  mm/fremap.c                  |  4 ++--
>  mm/hugetlb.c                 | 22 +++++++++++-----------
>  mm/memory-failure.c          |  4 ++--
>  mm/memory.c                  |  8 ++++----
>  mm/mmap.c                    | 22 +++++++++++-----------
>  mm/mremap.c                  |  6 +++---
>  mm/nommu.c                   | 17 ++++++++---------
>  mm/rmap.c                    | 12 ++++++------
>  16 files changed, 97 insertions(+), 82 deletions(-)
Apart from already mentioned cosmetics, the patchset looks good to me.
Acked-by: Kirill A. Shutemov <kirill.shutemov@intel.linux.com>
-- 
 Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 17+ messages in thread