* [PATCH 2/4] sysfs: Use revoke_file_mappings
2009-09-04 19:25 ` [PATCH 1/4] mm: Introduce revoke_file_mappings Eric W. Biederman
@ 2009-09-04 19:26 ` Eric W. Biederman
2009-09-04 19:27 ` [PATCH 3/4] proc: Clean up mmaps when a proc file is removed Eric W. Biederman
2009-09-08 22:18 ` [PATCH 1/4] mm: Introduce revoke_file_mappings Andrew Morton
1 sibling, 1 reply; 6+ messages in thread
From: Eric W. Biederman @ 2009-09-04 19:26 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, linux-kernel, linux-fsdevel, Alexey Dobriyan,
Greg Kroah-Hartman, Tejun Heo
Now that we have a generic helper, simplify sysfs by using it.
This requires a bit of a logic change because revoke_file_mappings
does proper clean up of the vmas which means it will call
fput which can call release, which grabs sysfs_bin_lock. So I remove
each bin_buffer from the list and drop sysfs_bin_lock before calling
revoke_file_mappings.
Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
---
fs/sysfs/bin.c | 210 ++++----------------------------------------------------
1 files changed, 13 insertions(+), 197 deletions(-)
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714..530459f 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -39,8 +39,6 @@ static DEFINE_MUTEX(sysfs_bin_lock);
struct bin_buffer {
struct mutex mutex;
void *buffer;
- int mmapped;
- struct vm_operations_struct *vm_ops;
struct file *file;
struct hlist_node list;
};
@@ -175,175 +173,6 @@ static ssize_t write(struct file *file, const char __user *userbuf,
return count;
}
-static void bin_vma_open(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
- if (!bb->vm_ops || !bb->vm_ops->open)
- return;
-
- if (!sysfs_get_active_two(attr_sd))
- return;
-
- bb->vm_ops->open(vma);
-
- sysfs_put_active_two(attr_sd);
-}
-
-static void bin_vma_close(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
- if (!bb->vm_ops || !bb->vm_ops->close)
- return;
-
- if (!sysfs_get_active_two(attr_sd))
- return;
-
- bb->vm_ops->close(vma);
-
- sysfs_put_active_two(attr_sd);
-}
-
-static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops || !bb->vm_ops->fault)
- return VM_FAULT_SIGBUS;
-
- if (!sysfs_get_active_two(attr_sd))
- return VM_FAULT_SIGBUS;
-
- ret = bb->vm_ops->fault(vma, vmf);
-
- sysfs_put_active_two(attr_sd);
- return ret;
-}
-
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return VM_FAULT_SIGBUS;
-
- if (!bb->vm_ops->page_mkwrite)
- return 0;
-
- if (!sysfs_get_active_two(attr_sd))
- return VM_FAULT_SIGBUS;
-
- ret = bb->vm_ops->page_mkwrite(vma, vmf);
-
- sysfs_put_active_two(attr_sd);
- return ret;
-}
-
-static int bin_access(struct vm_area_struct *vma, unsigned long addr,
- void *buf, int len, int write)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops || !bb->vm_ops->access)
- return -EINVAL;
-
- if (!sysfs_get_active_two(attr_sd))
- return -EINVAL;
-
- ret = bb->vm_ops->access(vma, addr, buf, len, write);
-
- sysfs_put_active_two(attr_sd);
- return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops || !bb->vm_ops->set_policy)
- return 0;
-
- if (!sysfs_get_active_two(attr_sd))
- return -EINVAL;
-
- ret = bb->vm_ops->set_policy(vma, new);
-
- sysfs_put_active_two(attr_sd);
- return ret;
-}
-
-static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct mempolicy *pol;
-
- if (!bb->vm_ops || !bb->vm_ops->get_policy)
- return vma->vm_policy;
-
- if (!sysfs_get_active_two(attr_sd))
- return vma->vm_policy;
-
- pol = bb->vm_ops->get_policy(vma, addr);
-
- sysfs_put_active_two(attr_sd);
- return pol;
-}
-
-static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
- const nodemask_t *to, unsigned long flags)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops || !bb->vm_ops->migrate)
- return 0;
-
- if (!sysfs_get_active_two(attr_sd))
- return 0;
-
- ret = bb->vm_ops->migrate(vma, from, to, flags);
-
- sysfs_put_active_two(attr_sd);
- return ret;
-}
-#endif
-
-static struct vm_operations_struct bin_vm_ops = {
- .open = bin_vma_open,
- .close = bin_vma_close,
- .fault = bin_fault,
- .page_mkwrite = bin_page_mkwrite,
- .access = bin_access,
-#ifdef CONFIG_NUMA
- .set_policy = bin_set_policy,
- .get_policy = bin_get_policy,
- .migrate = bin_migrate,
-#endif
-};
-
static int mmap(struct file *file, struct vm_area_struct *vma)
{
struct bin_buffer *bb = file->private_data;
@@ -367,22 +196,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
if (rc)
goto out_put;
- /*
- * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
- * to satisfy versions of X which crash if the mmap fails: that
- * substitutes a new vm_file, and we don't then want bin_vm_ops.
- */
- if (vma->vm_file != file)
- goto out_put;
-
- rc = -EINVAL;
- if (bb->mmapped && bb->vm_ops != vma->vm_ops)
- goto out_put;
-
rc = 0;
- bb->mmapped = 1;
- bb->vm_ops = vma->vm_ops;
- vma->vm_ops = &bin_vm_ops;
out_put:
sysfs_put_active_two(attr_sd);
out_unlock:
@@ -440,7 +254,7 @@ static int release(struct inode * inode, struct file * file)
struct bin_buffer *bb = file->private_data;
mutex_lock(&sysfs_bin_lock);
- hlist_del(&bb->list);
+ hlist_del_init(&bb->list);
mutex_unlock(&sysfs_bin_lock);
kfree(bb->buffer);
@@ -461,20 +275,22 @@ const struct file_operations bin_fops = {
void unmap_bin_file(struct sysfs_dirent *attr_sd)
{
struct bin_buffer *bb;
- struct hlist_node *tmp;
if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
return;
- mutex_lock(&sysfs_bin_lock);
-
- hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
- struct inode *inode = bb->file->f_path.dentry->d_inode;
-
- unmap_mapping_range(inode->i_mapping, 0, 0, 1);
- }
-
- mutex_unlock(&sysfs_bin_lock);
+ do {
+ bb = NULL;
+ mutex_lock(&sysfs_bin_lock);
+ if (!hlist_empty(&attr_sd->s_bin_attr.buffers)) {
+ bb = hlist_entry(attr_sd->s_bin_attr.buffers.first,
+ struct bin_buffer, list);
+ hlist_del_init(&bb->list);
+ }
+ mutex_unlock(&sysfs_bin_lock);
+ if (bb)
+ revoke_file_mappings(bb->file);
+ } while (bb);
}
/**
--
1.6.2.5
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH 1/4] mm: Introduce revoke_file_mappings.
2009-09-04 19:25 ` [PATCH 1/4] mm: Introduce revoke_file_mappings Eric W. Biederman
2009-09-04 19:26 ` [PATCH 2/4] sysfs: Use revoke_file_mappings Eric W. Biederman
@ 2009-09-08 22:18 ` Andrew Morton
1 sibling, 0 replies; 6+ messages in thread
From: Andrew Morton @ 2009-09-08 22:18 UTC (permalink / raw)
To: Eric W. Biederman
Cc: linux-mm, linux-kernel, linux-fsdevel, adobriyan, gregkh, tj
On Fri, 04 Sep 2009 12:25:28 -0700
ebiederm@xmission.com (Eric W. Biederman) wrote:
>
> When the backing store of a file becomes inaccessible we need a
> function to remove that file from the page tables and arrange for page
> faults to receive SIGBUS until the file is unmapped.
>
> The current implementation in sysfs almost gets this correct by
> intercepting vm_ops, but fails to call vm_ops->close on revoke and in
> fact does not have quite enough information available to do so. Which
> can result in leaks for any vm_ops that depend on close to drop
> reference counts.
>
> It turns out that revoke_file_mappings is less code and a more
> straightforward solution to the problem (except for the locking), as
> well as being a general solution that can work for any mmapped file
> and is not limited to sysfs.
>
> Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
> ---
> include/linux/mm.h | 2 +
> mm/memory.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 142 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9a72cc7..eb6cecb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -790,6 +790,8 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
> unmap_mapping_range(mapping, holebegin, holelen, 0);
> }
>
> +extern void revoke_file_mappings(struct file *file);
> +
> extern int vmtruncate(struct inode * inode, loff_t offset);
> extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
>
> diff --git a/mm/memory.c b/mm/memory.c
> index aede2ce..4b47116 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
mm/memory.c is large, messy and ill-defined. I think this new code
would fit nicely into a new mm/revoke.c?
> @@ -55,6 +55,7 @@
> #include <linux/kallsyms.h>
> #include <linux/swapops.h>
> #include <linux/elf.h>
> +#include <linux/file.h>
>
> #include <asm/pgalloc.h>
> #include <asm/uaccess.h>
> @@ -2410,6 +2411,145 @@ void unmap_mapping_range(struct address_space *mapping,
> }
> EXPORT_SYMBOL(unmap_mapping_range);
>
> +static int revoked_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + return VM_FAULT_SIGBUS;
> +}
> +
> +static struct vm_operations_struct revoked_vm_ops = {
> + .fault = revoked_fault,
> +};
> +
> +static void revoke_one_file_vma(struct file *file,
> + struct mm_struct *mm, unsigned long vma_addr)
It'd be nice to have a comment explaining what this function does, how
it does it, etc.
> +{
> + unsigned long start_addr, end_addr, size;
> + struct vm_area_struct *vma;
> +
> + /*
> + * Must be called with mmap_sem held for write.
> + *
> + * Holding the mmap_sem prevents all vm_ops operations from
> + * being called as well as preventing all other kinds of
> + * modifications to the mm.
Does it? I'm surprised that we'd have 100% coverage for a rule this
broad.
> + */
> +
> + /* Lookup a vma for my file address */
> + vma = find_vma(mm, vma_addr);
> + if (vma->vm_file != file)
What does it mean when this comparison failed?
> + return;
> +
> + start_addr = vma->vm_start;
> + end_addr = vma->vm_end;
> + size = end_addr - start_addr;
> +
> + /* Unlock the pages */
A "locked page" in the kernel has a specific meaning, and "being in a
VMA which has VM_LOCKED set" isn't it ;)
> + if (mm->locked_vm && (vma->vm_flags & VM_LOCKED)) {
> + mm->locked_vm -= vma_pages(vma);
> + vma->vm_flags &= ~VM_LOCKED;
So if ->locked_vm happens to be zero, we don't clear VM_LOCKED. Is
that a bug?
> + }
> +
> + /* Unmap the vma */
"Unmap the pages within the vma"
> + zap_page_range(vma, start_addr, size, NULL);
> +
> + /* Unlink the vma from the file */
> + unlink_file_vma(vma);
> +
> + /* Close the vma */
> + if (vma->vm_ops && vma->vm_ops->close)
> + vma->vm_ops->close(vma);
> + fput(vma->vm_file);
> + vma->vm_file = NULL;
> + if (vma->vm_flags & VM_EXECUTABLE)
> + removed_exe_file_vma(vma->vm_mm);
> +
> + /* Repurpose the vma */
> + vma->vm_private_data = NULL;
> + vma->vm_ops = &revoked_vm_ops;
> + vma->vm_flags &= ~(VM_NONLINEAR | VM_CAN_NONLINEAR | VM_PFNMAP);
geeze, where did this decision come from? Needs explanatory comments?
> +}
> +
> +void revoke_file_mappings(struct file *file)
Again, the semantics and intent of this code can only be divined by
reverse-engineering the implementation. This makes it hard to review
and harder to maintain.
> +{
> + /* After a file has been marked dead update the vmas */
> + struct address_space *mapping = file->f_mapping;
> + struct vm_area_struct *vma;
> + struct prio_tree_iter iter;
> + unsigned long start_address;
> + struct mm_struct *mm;
> + int mm_users;
> +
> + /*
> + * The locking here is a bit complex.
> + *
> + * - revoke_one_file_vma needs to be able to sleep so it can
> + * call vm_ops->close().
whitespace went funny.
> + *
> + * - i_mmap_lock needs to be held to iterate the list of vmas
> + * for a file.
> + *
> + * - The mm can be exiting when we find the vma on our list.
> + *
> + * - This function can not return until we can guarantee for
> + * all vmas associated with file that no vm_ops method will
> + * be called.
> + *
> + * This code increments mm_users to ensure that the mm will
> + * not go away after it drops i_mmap_lock, and then grabs
> + * mmap_sem for write to block all other modifications to the
> + * mm, before refinding the vma and removing it.
> + *
> + * If mm_users is already 0, indicating that exit_mmap is
> + * running on the mm, the code simply drops the locks and sleeps,
> + * giving exit_mmap a chance to finish. If exit_mmap has not
> + * freed our vma when we rescan the list, we repeat until it has.
> + */
All the above makes one wonder "in what contexts can this function be
called" and "what are its calling preconditions".
> + spin_lock(&mapping->i_mmap_lock);
> +restart_tree:
> + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
> + /* Skip quickly over vmas that do not need to be touched */
> + if (vma->vm_file != file)
> + continue;
We check this again in revoke_one_file_vma(). How come? Avoiding
races perhaps? I don't know, and I don't know how to find out.
> + start_address = vma->vm_start;
> + mm = vma->vm_mm;
> + mm_users = atomic_inc_not_zero(&mm->mm_users);
> + spin_unlock(&mapping->i_mmap_lock);
> + if (mm_users) {
> + down_write(&mm->mmap_sem);
> + revoke_one_file_vma(file, mm, start_address);
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + } else {
> + schedule(); /* wait for exit_mmap to remove the vma */
This doesn't "wait" for anything. It's a no-op unless need_resched()
happens to be true.
> + }
> + spin_lock(&mapping->i_mmap_lock);
> + goto restart_tree;
> + }
> +
> +restart_list:
> + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
> + /* Skip quickly over vmas that do not need to be touched */
> + if (vma->vm_file != file)
> + continue;
> + start_address = vma->vm_start;
> + mm = vma->vm_mm;
> + mm_users = atomic_inc_not_zero(&mm->mm_users);
> + spin_unlock(&mapping->i_mmap_lock);
> + if (mm_users) {
> + down_write(&mm->mmap_sem);
> + revoke_one_file_vma(file, mm, start_address);
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + } else {
> + schedule(); /* wait for exit_mmap to remove the vma */
> + }
> + spin_lock(&mapping->i_mmap_lock);
> + goto restart_list;
> + }
> +
> + spin_unlock(&mapping->i_mmap_lock);
> +}
> +
> /**
> * vmtruncate - unmap mappings "freed" by truncate() syscall
> * @inode: inode of the file used
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 6+ messages in thread