[PATCH 2/5] revoke: core code

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/5] revoke: core code
@ 2007-03-11 11:30 Pekka J Enberg
  2007-03-16  1:34 ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Pekka J Enberg @ 2007-03-11 11:30 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, hch, alan

From: Pekka Enberg <penberg@cs.helsinki.fi>

The revokeat(2) and frevoke(2) system calls invalidate open file
descriptors and shared mappings of an inode. After an successful
revocation, operations on file descriptors fail with the EBADF or
ENXIO error code for regular and device files,
respectively. Attempting to read from or write to a revoked mapping
causes SIGBUS.

The actual operation is done in two passes:

 1. Revoke all file descriptors that point to the given inode. We do
    this under tasklist_lock so that after this pass, we don't need
    to worry about racing with close(2) or dup(2).
   
 2. Take down shared memory mappings of the inode and close all file
    pointers.

The file descriptors and memory mapping ranges are preserved until the
owning task does close(2) and munmap(2), respectively.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 fs/Makefile                  |    2 
 fs/revoke.c                  |  588 +++++++++++++++++++++++++++++++++++++++++++
 fs/revoked_inode.c           |  378 +++++++++++++++++++++++++++
 include/linux/fs.h           |    4 
 include/linux/revoked_fs_i.h |   20 +
 include/linux/syscalls.h     |    3 
 6 files changed, 994 insertions(+), 1 deletion(-)

Index: uml-2.6/fs/Makefile
===================================================================
--- uml-2.6.orig/fs/Makefile	2007-03-11 13:07:57.000000000 +0200
+++ uml-2.6/fs/Makefile	2007-03-11 13:09:20.000000000 +0200
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o revoke.o revoked_inode.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
Index: uml-2.6/include/linux/syscalls.h
===================================================================
--- uml-2.6.orig/include/linux/syscalls.h	2007-03-11 13:07:57.000000000 +0200
+++ uml-2.6/include/linux/syscalls.h	2007-03-11 13:09:20.000000000 +0200
@@ -605,4 +605,7 @@ asmlinkage long sys_getcpu(unsigned __us
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage int sys_revokeat(int dfd, const char __user *filename);
+asmlinkage int sys_frevoke(unsigned int fd);
+
 #endif
Index: uml-2.6/include/linux/fs.h
===================================================================
--- uml-2.6.orig/include/linux/fs.h	2007-03-11 13:07:57.000000000 +0200
+++ uml-2.6/include/linux/fs.h	2007-03-11 13:09:20.000000000 +0200
@@ -1100,6 +1100,7 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*revoke)(struct file *);
 };
 
 struct inode_operations {
@@ -1739,6 +1740,9 @@ extern ssize_t generic_splice_sendpage(s
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		size_t len, unsigned int flags);
 
+/* fs/revoke.c */
+extern int generic_file_revoke(struct file *);
+
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
Index: uml-2.6/fs/revoke.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uml-2.6/fs/revoke.c	2007-03-11 13:14:42.000000000 +0200
@@ -0,0 +1,588 @@
+/*
+ * fs/revoke.c - Invalidate all current open file descriptors of an inode.
+ *
+ * Copyright (C) 2006-2007  Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/revoked_fs_i.h>
+
+/*
+ * This is used for pre-allocating an array of file pointers so that we don't
+ * have to do memory allocation under tasklist_lock.
+ */
+struct revoke_table {
+	struct file **files;
+	unsigned long size;
+	unsigned long end;
+	unsigned long restore_start;
+};
+
+struct kmem_cache *revokefs_inode_cache;
+
+/*
+ * Revoked file descriptors point to inodes in the revokefs filesystem.
+ */
+static struct vfsmount *revokefs_mnt;
+
+static struct file *get_revoked_file(void)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *filp;
+	struct qstr name;
+
+	filp = get_empty_filp();
+	if (!filp)
+		goto err;
+
+	inode = new_inode(revokefs_mnt->mnt_sb);
+	if (!inode)
+		goto err_inode;
+
+	name.name = "revoked_file";
+	name.len = strlen(name.name);
+	dentry = d_alloc(revokefs_mnt->mnt_sb->s_root, &name);
+	if (!dentry)
+		goto err_dentry;
+
+	d_instantiate(dentry, inode);
+
+	filp->f_mapping = inode->i_mapping;
+	filp->f_dentry = dget(dentry);
+	filp->f_vfsmnt = mntget(revokefs_mnt);
+	filp->f_op = fops_get(inode->i_fop);
+	filp->f_pos = 0;
+
+	return filp;
+
+  err_dentry:
+	iput(inode);
+  err_inode:
+	fput(filp);
+  err:
+	return NULL;
+}
+
+static inline int inode_matches(struct file *file, struct inode *inode,
+				struct file *to_exclude)
+{
+	return file && file != to_exclude && file->f_dentry->d_inode == inode;
+}
+
+static inline bool revoke_table_is_full(struct revoke_table *table)
+{
+	return table->end == table->size;
+}
+
+static inline struct file *revoke_table_get(struct revoke_table *table)
+{
+	return table->files[table->end++];
+}
+
+/*
+ * 	LOCKING: task_lock(owner)
+ */
+static int revoke_fds(struct task_struct *owner,
+		      struct inode *inode,
+		      struct file *to_exclude, struct revoke_table *table)
+{
+	struct files_struct *files;
+	struct fdtable *fdt;
+	unsigned int fd;
+	int err = 0;
+
+	files = get_files_struct(owner);
+	if (!files)
+		goto out;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+
+	for (fd = 0; fd < fdt->max_fds; fd++) {
+		struct revokefs_inode_info *info;
+		struct file *filp, *new_filp;
+		struct inode *new_inode;
+
+		filp = fcheck_files(files, fd);
+		if (!inode_matches(filp, inode, to_exclude))
+			continue;
+
+		if (!filp->f_op->revoke) {
+			err = -EOPNOTSUPP;
+			goto failed;
+		}
+
+		if (revoke_table_is_full(table)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		new_filp = revoke_table_get(table);
+		get_file(new_filp);
+
+		/*
+		 * Replace original struct file pointer with a pointer to
+		 * a 'revoked file.'  After this point, we don't need to worry
+		 * about racing with sys_close or sys_dup.
+		 */
+		rcu_assign_pointer(fdt->fd[fd], new_filp);
+
+		/*
+		 * Hold on to task until we can take down the file and its
+		 * mmap.
+		 */
+		get_task_struct(owner);
+
+		new_inode = new_filp->f_dentry->d_inode;
+		make_revoked_inode(new_inode, inode->i_mode & S_IFMT);
+
+		info = revokefs_i(new_inode);
+		info->fd = fd;
+		info->file = filp;
+		info->owner = owner;
+	}
+  failed:
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+  out:
+	return err;
+}
+
+static int revoke_vma(struct vm_area_struct *vma, struct zap_details *details)
+{
+	unsigned long restart_addr, start_addr, end_addr;
+	int need_break;
+
+	start_addr = vma->vm_start;
+	end_addr = vma->vm_end;
+
+	/*
+ 	 * Not holding ->mmap_sem here.
+	 */
+	vma->vm_flags |= VM_REVOKED;
+	smp_mb();
+  again:
+	restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr,
+				      details);
+
+	need_break = need_resched() || need_lockbreak(details->i_mmap_lock);
+	if (need_break)
+		goto out_need_break;
+
+	if (restart_addr < end_addr) {
+		start_addr = restart_addr;
+		goto again;
+	}
+	return 0;
+
+  out_need_break:
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;
+}
+
+static int revoke_mapping(struct address_space *mapping, struct file *to_exclude)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	struct zap_details details;
+	int err = 0;
+
+	details.i_mmap_lock = &mapping->i_mmap_lock;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
+		if ((vma->vm_flags & VM_SHARED) && vma->vm_file != to_exclude) {
+			err = revoke_vma(vma, &details);
+			if (err)
+				goto out;
+		}
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
+		if ((vma->vm_flags & VM_SHARED) && vma->vm_file != to_exclude) {
+			err = revoke_vma(vma, &details);
+			if (err)
+				goto out;
+		}
+	}
+  out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return err;
+}
+
+static void restore_file(struct revokefs_inode_info *info)
+{
+	struct files_struct *files;
+
+	files = get_files_struct(info->owner);
+	if (files) {
+		struct fdtable *fdt;
+		struct file *filp;
+
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+
+		filp = fdt->fd[info->fd];
+		if (filp)
+			fput(filp);
+
+		rcu_assign_pointer(fdt->fd[info->fd], info->file);
+		FD_SET(info->fd, fdt->close_on_exec);
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+	put_task_struct(info->owner);
+	info->owner = NULL;	/* To avoid double-restore. */
+}
+
+static void restore_files(struct revoke_table *table)
+{
+	unsigned long i;
+
+	for (i = table->restore_start; i < table->end; i++) {
+		struct revokefs_inode_info *info;
+		struct file *filp;
+
+		filp = table->files[i];
+		info = revokefs_i(filp->f_dentry->d_inode);
+
+		restore_file(info);
+	}
+}
+
+static int revoke_files(struct revoke_table *table)
+{
+	unsigned long i;
+	int err = 0;
+
+	for (i = 0; i < table->end; i++) {
+		struct revokefs_inode_info *info;
+		struct file *this, *filp;
+
+		this = table->files[i];
+		info = revokefs_i(this->f_dentry->d_inode);
+
+		/*
+		 * Increase count before attempting to close file as
+		 * an partially closed file can no longer be restored.
+		 */
+		table->restore_start++;
+		filp = info->file;
+		err = filp->f_op->revoke(filp);
+		put_task_struct(info->owner);
+		info->owner = NULL;	/* To avoid restoring closed file. */
+		if (err)
+			goto failed;
+	}
+	return 0;
+
+  failed:
+	restore_files(table);
+	return err;
+}
+
+/*
+ *	Returns the maximum number of file descriptors pointing to an inode.
+ *
+ *	LOCKING: read_lock(&tasklist_lock)
+ */
+static unsigned long inode_fds(struct inode *inode, struct file *to_exclude)
+{
+	struct task_struct *g, *p;
+	unsigned long nr_fds = 0;
+
+	do_each_thread(g, p) {
+		struct files_struct *files;
+		struct fdtable *fdt;
+		unsigned int fd;
+
+		files = get_files_struct(p);
+		if (!files)
+			continue;
+
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+		for (fd = 0; fd < fdt->max_fds; fd++) {
+			struct file *file;
+
+			file = fcheck_files(files, fd);
+			if (inode_matches(file, inode, to_exclude)) {
+				nr_fds += fdt->max_fds;
+				break;
+			}
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+	while_each_thread(g, p);
+	return nr_fds;
+}
+
+static void free_revoke_table(struct revoke_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->size; i++)
+		fput(table->files[i]);
+
+	kfree(table->files);
+	kfree(table);
+}
+
+static struct revoke_table *__alloc_revoke_table(unsigned long size)
+{
+	struct revoke_table *table;
+	int i;
+
+	table = kzalloc(sizeof *table, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->size = size;
+	table->files = kcalloc(size, sizeof(struct file *), GFP_KERNEL);
+	if (!table->files) {
+		kfree(table);
+		return NULL;
+	}
+
+	for (i = 0; i < table->size; i++) {
+		struct file *filp;
+
+		filp = get_revoked_file();
+		if (!filp)
+			goto err;
+
+		table->files[i] = filp;
+	}
+	return table;
+  err:
+	free_revoke_table(table);
+	return NULL;
+}
+
+static struct revoke_table *alloc_revoke_table(struct inode *inode,
+					       struct file *to_exclude)
+{
+	unsigned long nr_fds;
+
+	read_lock(&tasklist_lock);
+	nr_fds = inode_fds(inode, to_exclude);
+	read_unlock(&tasklist_lock);
+
+	return __alloc_revoke_table(nr_fds);
+}
+
+static int do_revoke(struct inode *inode, struct file *to_exclude)
+{
+	struct revoke_table *table = NULL;
+	struct task_struct *g, *p;
+	int err = 0;
+
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
+		err = -EPERM;
+		goto out;
+	}
+
+  retry:
+	if (signal_pending(current)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	table = alloc_revoke_table(inode, to_exclude);
+	if (!table) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	read_lock(&tasklist_lock);
+
+	/*
+	 * If someone forked while we were allocating memory, try again.
+	 */
+	if (inode_fds(inode, to_exclude) > table->size) {
+		read_unlock(&tasklist_lock);
+		free_revoke_table(table);
+		goto retry;
+	}
+
+	/*
+	 * First revoke the descriptors. After we are done, no one can start
+	 * new operations on them.
+	 */
+	do_each_thread(g, p) {
+		err = revoke_fds(p, inode, to_exclude, table);
+		if (err)
+			goto exit_loop;
+	}
+	while_each_thread(g, p);
+  exit_loop:
+	read_unlock(&tasklist_lock);
+
+	if (err) {
+		restore_files(table);
+		goto out_free_table;
+	}
+
+	/*
+	 * Take down shared memory mappings.
+	 */
+	err = revoke_mapping(inode->i_mapping, to_exclude);
+	if (err) {
+		restore_files(table);
+		goto out_free_table;
+	}
+
+	/*
+	 * Now, revoke the files for good.
+	 */
+	err = revoke_files(table);
+  out_free_table:
+	free_revoke_table(table);
+  out:
+	return err;
+}
+
+asmlinkage int sys_revokeat(int dfd, const char __user * filename)
+{
+	struct nameidata nd;
+	int err;
+
+	err = __user_walk_fd(dfd, filename, 0, &nd);
+	if (!err) {
+		err = do_revoke(nd.dentry->d_inode, NULL);
+		path_release(&nd);
+	}
+	return err;
+}
+
+asmlinkage int sys_frevoke(unsigned int fd)
+{
+	struct file *file = fget(fd);
+	int err = -EBADF;
+
+	if (file) {
+		err = do_revoke(file->f_dentry->d_inode, file);
+		fput(file);
+	}
+	return err;
+}
+
+int generic_file_revoke(struct file *file)
+{
+	int err;
+
+	/*
+	 * Flush pending writes.
+	 */
+	err = do_fsync(file, 1);
+	if (err)
+		goto out;
+
+	/*
+	 * Make pending reads fail.
+	 */
+	err = invalidate_inode_pages2(file->f_mapping);
+
+  out:
+	return err;
+}
+
+EXPORT_SYMBOL(generic_file_revoke);
+
+/*
+ *	Filesystem for revoked files.
+ */
+
+static struct inode *revokefs_alloc_inode(struct super_block *sb)
+{
+	struct revokefs_inode_info *info;
+
+	info = kmem_cache_alloc(revokefs_inode_cache, GFP_NOFS);
+	if (!info)
+		return NULL;
+
+	return &info->vfs_inode;
+}
+
+static void revokefs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(revokefs_inode_cache, revokefs_i(inode));
+}
+
+static struct super_operations revokefs_super_ops = {
+	.alloc_inode = revokefs_alloc_inode,
+	.destroy_inode = revokefs_destroy_inode,
+	.drop_inode = generic_delete_inode,
+};
+
+static int revokefs_get_sb(struct file_system_type *fs_type,
+			   int flags, const char *dev_name, void *data,
+			   struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "revoke:", &revokefs_super_ops,
+			     REVOKEFS_MAGIC, mnt);
+}
+
+struct file_system_type revokefs_fs_type = {
+	.name = "revokefs",
+	.get_sb = revokefs_get_sb,
+	.kill_sb = kill_anon_super
+};
+
+static void revokefs_init_inode(void *obj, struct kmem_cache *cache,
+				unsigned long flags)
+{
+	struct revokefs_inode_info *info = obj;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		info->owner = NULL;
+		inode_init_once(&info->vfs_inode);
+	}
+}
+
+static int __init revokefs_init(void)
+{
+	int err = -ENOMEM;
+
+	revokefs_inode_cache =
+	    kmem_cache_create("revokefs_inode_cache",
+			      sizeof(struct revokefs_inode_info),
+			      0,
+			      (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+			       SLAB_MEM_SPREAD), revokefs_init_inode, NULL);
+	if (!revokefs_inode_cache)
+		goto out;
+
+	err = register_filesystem(&revokefs_fs_type);
+	if (err)
+		goto err_register;
+
+	revokefs_mnt = kern_mount(&revokefs_fs_type);
+	if (IS_ERR(revokefs_mnt)) {
+		err = PTR_ERR(revokefs_mnt);
+		goto err_mnt;
+	}
+  out:
+	return err;
+  err_mnt:
+	unregister_filesystem(&revokefs_fs_type);
+  err_register:
+	kmem_cache_destroy(revokefs_inode_cache);
+	return err;
+}
+
+late_initcall(revokefs_init);
Index: uml-2.6/fs/revoked_inode.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uml-2.6/fs/revoked_inode.c	2007-03-11 13:09:20.000000000 +0200
@@ -0,0 +1,378 @@
+/*
+ * fs/revoked_inode.c
+ *
+ * Copyright (C) 2007  Pekka Enberg
+ *
+ * Provide stub functions for revoked inodes. Based on fs/bad_inode.c which is
+ *
+ * Copyright (C) 1997  Stephen Tweedie
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/time.h>
+#include <linux/smp_lock.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/revoked_fs_i.h>
+
+static loff_t revoked_file_llseek(struct file *file, loff_t offset, int origin)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_read(struct file *filp, char __user * buf,
+				 size_t size, loff_t * ppos)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_special_file_read(struct file *filp, char __user * buf,
+					 size_t size, loff_t * ppos)
+{
+	return 0;
+}
+
+static ssize_t revoked_file_write(struct file *filp, const char __user * buf,
+				  size_t siz, loff_t * ppos)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_aio_read(struct kiocb *iocb,
+				     const struct iovec *iov,
+				     unsigned long nr_segs, loff_t pos)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_aio_write(struct kiocb *iocb,
+				      const struct iovec *iov,
+				      unsigned long nr_segs, loff_t pos)
+{
+	return -EBADF;
+}
+
+static int revoked_file_readdir(struct file *filp, void *dirent,
+				filldir_t filldir)
+{
+	return -EBADF;
+}
+
+static unsigned int revoked_file_poll(struct file *filp, poll_table * wait)
+{
+	return POLLERR;
+}
+
+static int revoked_file_ioctl(struct inode *inode, struct file *filp,
+			      unsigned int cmd, unsigned long arg)
+{
+	return -EBADF;
+}
+
+static long revoked_file_unlocked_ioctl(struct file *file, unsigned cmd,
+					unsigned long arg)
+{
+	return -EBADF;
+}
+
+static long revoked_file_compat_ioctl(struct file *file, unsigned int cmd,
+				      unsigned long arg)
+{
+	return -EBADF;
+}
+
+static int revoked_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return -EBADF;
+}
+
+static int revoked_file_open(struct inode *inode, struct file *filp)
+{
+	return -EBADF;
+}
+
+static int revoked_file_flush(struct file *file, fl_owner_t id)
+{
+	return filp_close(file, id);
+}
+
+static int revoked_file_release(struct inode *inode, struct file *filp)
+{
+	return -EBADF;
+}
+
+static int revoked_file_fsync(struct file *file, struct dentry *dentry,
+			      int datasync)
+{
+	return -EBADF;
+}
+
+static int revoked_file_aio_fsync(struct kiocb *iocb, int datasync)
+{
+	return -EBADF;
+}
+
+static int revoked_file_fasync(int fd, struct file *filp, int on)
+{
+	return -EBADF;
+}
+
+static int revoked_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_sendfile(struct file *in_file, loff_t * ppos,
+				     size_t count, read_actor_t actor,
+				     void *target)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_sendpage(struct file *file, struct page *page,
+				     int off, size_t len, loff_t * pos,
+				     int more)
+{
+	return -EBADF;
+}
+
+static unsigned long revoked_file_get_unmapped_area(struct file *file,
+						    unsigned long addr,
+						    unsigned long len,
+						    unsigned long pgoff,
+						    unsigned long flags)
+{
+	return -EBADF;
+}
+
+static int revoked_file_check_flags(int flags)
+{
+	return -EBADF;
+}
+
+static int revoked_file_dir_notify(struct file *file, unsigned long arg)
+{
+	return -EBADF;
+}
+
+static int revoked_file_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t * ppos,
+					 size_t len, unsigned int flags)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_file_splice_read(struct file *in, loff_t * ppos,
+					struct pipe_inode_info *pipe,
+					size_t len, unsigned int flags)
+{
+	return -EBADF;
+}
+
+static const struct file_operations revoked_file_ops = {
+	.llseek = revoked_file_llseek,
+	.read = revoked_file_read,
+	.write = revoked_file_write,
+	.aio_read = revoked_file_aio_read,
+	.aio_write = revoked_file_aio_write,
+	.readdir = revoked_file_readdir,
+	.poll = revoked_file_poll,
+	.ioctl = revoked_file_ioctl,
+	.unlocked_ioctl = revoked_file_unlocked_ioctl,
+	.compat_ioctl = revoked_file_compat_ioctl,
+	.mmap = revoked_file_mmap,
+	.open = revoked_file_open,
+	.flush = revoked_file_flush,
+	.release = revoked_file_release,
+	.fsync = revoked_file_fsync,
+	.aio_fsync = revoked_file_aio_fsync,
+	.fasync = revoked_file_fasync,
+	.lock = revoked_file_lock,
+	.sendfile = revoked_file_sendfile,
+	.sendpage = revoked_file_sendpage,
+	.get_unmapped_area = revoked_file_get_unmapped_area,
+	.check_flags = revoked_file_check_flags,
+	.dir_notify = revoked_file_dir_notify,
+	.flock = revoked_file_flock,
+	.splice_write = revoked_file_splice_write,
+	.splice_read = revoked_file_splice_read,
+};
+
+static const struct file_operations revoked_special_file_ops = {
+	.llseek = revoked_file_llseek,
+	.read = revoked_special_file_read,
+	.write = revoked_file_write,
+	.aio_read = revoked_file_aio_read,
+	.aio_write = revoked_file_aio_write,
+	.readdir = revoked_file_readdir,
+	.poll = revoked_file_poll,
+	.ioctl = revoked_file_ioctl,
+	.unlocked_ioctl = revoked_file_unlocked_ioctl,
+	.compat_ioctl = revoked_file_compat_ioctl,
+	.mmap = revoked_file_mmap,
+	.open = revoked_file_open,
+	.flush = revoked_file_flush,
+	.release = revoked_file_release,
+	.fsync = revoked_file_fsync,
+	.aio_fsync = revoked_file_aio_fsync,
+	.fasync = revoked_file_fasync,
+	.lock = revoked_file_lock,
+	.sendfile = revoked_file_sendfile,
+	.sendpage = revoked_file_sendpage,
+	.get_unmapped_area = revoked_file_get_unmapped_area,
+	.check_flags = revoked_file_check_flags,
+	.dir_notify = revoked_file_dir_notify,
+	.flock = revoked_file_flock,
+	.splice_write = revoked_file_splice_write,
+	.splice_read = revoked_file_splice_read,
+};
+
+static int revoked_inode_create(struct inode *dir, struct dentry *dentry,
+				int mode, struct nameidata *nd)
+{
+	return -EBADF;
+}
+
+static struct dentry *revoked_inode_lookup(struct inode *dir,
+					   struct dentry *dentry,
+					   struct nameidata *nd)
+{
+	return ERR_PTR(-EBADF);
+}
+
+static int revoked_inode_link(struct dentry *old_dentry, struct inode *dir,
+			      struct dentry *dentry)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_unlink(struct inode *dir, struct dentry *dentry)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_symlink(struct inode *dir, struct dentry *dentry,
+				 const char *symname)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_mkdir(struct inode *dir, struct dentry *dentry,
+			       int mode)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_mknod(struct inode *dir, struct dentry *dentry,
+			       int mode, dev_t rdev)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_rename(struct inode *old_dir,
+				struct dentry *old_dentry,
+				struct inode *new_dir,
+				struct dentry *new_dentry)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_readlink(struct dentry *dentry, char __user * buffer,
+				  int buflen)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_permission(struct inode *inode, int mask,
+				    struct nameidata *nd)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_setxattr(struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_inode_getxattr(struct dentry *dentry, const char *name,
+				      void *buffer, size_t size)
+{
+	return -EBADF;
+}
+
+static ssize_t revoked_inode_listxattr(struct dentry *dentry, char *buffer,
+				       size_t buffer_size)
+{
+	return -EBADF;
+}
+
+static int revoked_inode_removexattr(struct dentry *dentry, const char *name)
+{
+	return -EBADF;
+}
+
+static struct inode_operations revoked_inode_ops = {
+	.create = revoked_inode_create,
+	.lookup = revoked_inode_lookup,
+	.link = revoked_inode_link,
+	.unlink = revoked_inode_unlink,
+	.symlink = revoked_inode_symlink,
+	.mkdir = revoked_inode_mkdir,
+	.rmdir = revoked_inode_rmdir,
+	.mknod = revoked_inode_mknod,
+	.rename = revoked_inode_rename,
+	.readlink = revoked_inode_readlink,
+	/* follow_link must be no-op, otherwise unmounting this inode
+	   won't work */
+	/* put_link returns void */
+	/* truncate returns void */
+	.permission = revoked_inode_permission,
+	.getattr = revoked_inode_getattr,
+	.setattr = revoked_inode_setattr,
+	.setxattr = revoked_inode_setxattr,
+	.getxattr = revoked_inode_getxattr,
+	.listxattr = revoked_inode_listxattr,
+	.removexattr = revoked_inode_removexattr,
+	/* truncate_range returns void */
+};
+
+void make_revoked_inode(struct inode *inode, int mode)
+{
+	remove_inode_hash(inode);
+
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime =
+	    current_fs_time(inode->i_sb);
+	inode->i_op = &revoked_inode_ops;
+
+	if (special_file(mode))
+		inode->i_fop = &revoked_special_file_ops;
+	else
+		inode->i_fop = &revoked_file_ops;
+}
Index: uml-2.6/include/linux/revoked_fs_i.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uml-2.6/include/linux/revoked_fs_i.h	2007-03-11 13:09:20.000000000 +0200
@@ -0,0 +1,20 @@
+#ifndef _LINUX_REVOKED_FS_I_H
+#define _LINUX_REVOKED_FS_I_H
+
+#define REVOKEFS_MAGIC 0x5245564B      /* REVK */
+
+struct revokefs_inode_info {
+	struct task_struct *owner;
+	struct file *file;
+	unsigned int fd;
+	struct inode vfs_inode;
+};
+
+static inline struct revokefs_inode_info *revokefs_i(struct inode *inode)
+{
+	return container_of(inode, struct revokefs_inode_info, vfs_inode);
+}
+
+void make_revoked_inode(struct inode *, int);
+
+#endif

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-11 11:30 [PATCH 2/5] revoke: core code Pekka J Enberg
@ 2007-03-16  1:34 ` Andrew Morton
  2007-03-16  6:44   ` Pekka Enberg
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2007-03-16  1:34 UTC (permalink / raw)
  To: Pekka J Enberg; +Cc: linux-kernel, hch, alan

On Sun, 11 Mar 2007 13:30:49 +0200 (EET) Pekka J Enberg <penberg@cs.helsinki.fi> wrote:

> From: Pekka Enberg <penberg@cs.helsinki.fi>
> 
> The revokeat(2) and frevoke(2) system calls invalidate open file
> descriptors and shared mappings of an inode. After an successful
> revocation, operations on file descriptors fail with the EBADF or
> ENXIO error code for regular and device files,
> respectively. Attempting to read from or write to a revoked mapping
> causes SIGBUS.
> 
> The actual operation is done in two passes:
> 
>  1. Revoke all file descriptors that point to the given inode. We do
>     this under tasklist_lock so that after this pass, we don't need
>     to worry about racing with close(2) or dup(2).
>    
>  2. Take down shared memory mappings of the inode and close all file
>     pointers.
> 
> The file descriptors and memory mapping ranges are preserved until the
> owning task does close(2) and munmap(2), respectively.
> 
> ...
>
> +asmlinkage int sys_revokeat(int dfd, const char __user *filename);
> +asmlinkage int sys_frevoke(unsigned int fd);

noooo.... all system calls must return long.

> +static int revoke_vma(struct vm_area_struct *vma, struct zap_details *details)
> +{
> +	unsigned long restart_addr, start_addr, end_addr;
> +	int need_break;
> +
> +	start_addr = vma->vm_start;
> +	end_addr = vma->vm_end;
> +
> +	/*
> + 	 * Not holding ->mmap_sem here.
> +	 */
> +	vma->vm_flags |= VM_REVOKED;

so....  the modification of vm_flags is racy?

> +	smp_mb();

Please always document barriers.  There's presumably some vm_flags reader
we're concerned about here, but how is the code reader to know what the
code writer was thinking?


> +  again:
> +	restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr,
> +				      details);
> +
> +	need_break = need_resched() || need_lockbreak(details->i_mmap_lock);
> +	if (need_break)
> +		goto out_need_break;
> +
> +	if (restart_addr < end_addr) {
> +		start_addr = restart_addr;
> +		goto again;
> +	}
> +	return 0;
> +
> +  out_need_break:
> +	spin_unlock(details->i_mmap_lock);
> +	cond_resched();
> +	spin_lock(details->i_mmap_lock);
> +	return -EINTR;
> +}
> +
> +static int revoke_mapping(struct address_space *mapping, struct file *to_exclude)
> +{
> +	struct vm_area_struct *vma;
> +	struct prio_tree_iter iter;
> +	struct zap_details details;
> +	int err = 0;
> +
> +	details.i_mmap_lock = &mapping->i_mmap_lock;
> +
> +	spin_lock(&mapping->i_mmap_lock);
> +	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
> +		if ((vma->vm_flags & VM_SHARED) && vma->vm_file != to_exclude) {
> +			err = revoke_vma(vma, &details);
> +			if (err)
> +				goto out;
> +		}
> +	}
> +
> +	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
> +		if ((vma->vm_flags & VM_SHARED) && vma->vm_file != to_exclude) {
> +			err = revoke_vma(vma, &details);
> +			if (err)
> +				goto out;
> +		}
> +	}
> +  out:
> +	spin_unlock(&mapping->i_mmap_lock);
> +	return err;
> +}

This all looks very strange.  If the calling process expires its timeslice,
the entire system call fails?

What's happening here?


> +
> +int generic_file_revoke(struct file *file)
> +{
> +	int err;
> +
> +	/*
> +	 * Flush pending writes.
> +	 */
> +	err = do_fsync(file, 1);
> +	if (err)
> +		goto out;
> +
> +	/*
> +	 * Make pending reads fail.
> +	 */
> +	err = invalidate_inode_pages2(file->f_mapping);
> +
> +  out:
> +	return err;
> +}
> +
> +EXPORT_SYMBOL(generic_file_revoke);

do_fsync() is seriously suboptimal - it will run an ext3 commit. 
do_sync_file_range(...,
SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER)
will run maybe five times quicker.

But otoh, do_sync_file_range() will fail to write back the pages for a
data=journal ext3 file, I expect (oops).


Why is this code using invalidate_inode_pages2()?  That function keeps on
breaking, has ill-defined semantics and will probably change in the future.

Exactly what semantics are you looking for here, and why?

The blank line before the EXPORT_SYMBOL() is a waste of space.

> +/*
> + *	Filesystem for revoked files.
> + */
> +
> +static struct inode *revokefs_alloc_inode(struct super_block *sb)
> +{
> +	struct revokefs_inode_info *info;
> +
> +	info = kmem_cache_alloc(revokefs_inode_cache, GFP_NOFS);
> +	if (!info)
> +		return NULL;
> +
> +	return &info->vfs_inode;
> +}

Why GFP_NOFS?

> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ uml-2.6/include/linux/revoked_fs_i.h	2007-03-11 13:09:20.000000000 +0200
> @@ -0,0 +1,20 @@
> +#ifndef _LINUX_REVOKED_FS_I_H
> +#define _LINUX_REVOKED_FS_I_H
> +
> +#define REVOKEFS_MAGIC 0x5245564B      /* REVK */

This is supposed to go into magic.h.



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16  1:34 ` Andrew Morton
@ 2007-03-16  6:44   ` Pekka Enberg
  2007-03-16 11:26     ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Pekka Enberg @ 2007-03-16  6:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch, alan

Hi Andrew,

On Sun, 11 Mar 2007 13:30:49 +0200 (EET) Pekka J Enberg
<penberg@cs.helsinki.fi> wrote:

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> noooo.... all system calls must return long.

Fixed.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> so....  the modification of vm_flags is racy?
>
> > +     smp_mb();
>
> Please always document barriers.  There's presumably some vm_flags reader
> we're concerned about here, but how is the code reader to know what the
> code writer was thinking?

We're need to watch out for page faults after the shared mappings have
been taken down and mmap(2) trying to remap. I'll add a comment here.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> This all looks very strange.  If the calling process expires its timeslice,
> the entire system call fails?
>
> What's happening here?

Me being stupid. I followed what unmap_mapping_range_vma is doing but
failed to see what its callers are doing. I'll fix it up.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> do_fsync() is seriously suboptimal - it will run an ext3 commit.
> do_sync_file_range(...,
> SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER)
> will run maybe five times quicker.
>
> But otoh, do_sync_file_range() will fail to write back the pages for a
> data=journal ext3 file, I expect (oops).

But it's good enough for generic_file_revoke, no? Ext3 should probably
implement it's own revoke hook so you can drop the ext2 and ext3 hooks
if you're worried, I did them mostly for testing.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> Why is this code using invalidate_inode_pages2()?  That function keeps on
> breaking, has ill-defined semantics and will probably change in the future.
>
> Exactly what semantics are you looking for here, and why?

What the comment says "make pending reads fail." When revoking an
inode, we need to make sure there are no pending I/O that will
complete after revocation and thus leak information.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> The blank line before the EXPORT_SYMBOL() is a waste of space.

I'll fix that up.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> > +static struct inode *revokefs_alloc_inode(struct super_block *sb)
> > +{
> > +     struct revokefs_inode_info *info;
> > +
> > +     info = kmem_cache_alloc(revokefs_inode_cache, GFP_NOFS);
> > +     if (!info)
> > +             return NULL;
> > +
> > +     return &info->vfs_inode;
> > +}
>
> Why GFP_NOFS?

GFP_KERNEL should be sufficient. I'll fix that up.

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> > ===================================================================
> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ uml-2.6/include/linux/revoked_fs_i.h      2007-03-11 13:09:20.000000000 +0200
> > @@ -0,0 +1,20 @@
> > +#ifndef _LINUX_REVOKED_FS_I_H
> > +#define _LINUX_REVOKED_FS_I_H
> > +
> > +#define REVOKEFS_MAGIC 0x5245564B      /* REVK */
>
> This is supposed to go into magic.h.

Will do. Thank you Andrew.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16  6:44   ` Pekka Enberg
@ 2007-03-16 11:26     ` Andrew Morton
  2007-03-16 11:44       ` Pekka J Enberg
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2007-03-16 11:26 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: linux-kernel, hch, alan

On Fri, 16 Mar 2007 08:44:46 +0200 "Pekka Enberg" <penberg@cs.helsinki.fi> wrote:

> On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> > Why is this code using invalidate_inode_pages2()?  That function keeps on
> > breaking, has ill-defined semantics and will probably change in the future.
> >
> > Exactly what semantics are you looking for here, and why?
> 
> What the comment says "make pending reads fail." When revoking an
> inode, we need to make sure there are no pending I/O that will
> complete after revocation and thus leak information.

hm, let's define "pending".

I assume that any future callers to sys_read() will reliably do the right
thing at this stage, so we are concerned with threads which are presently
partway through a read from this inode?

If that's not accurate then please describe with some detail exactly what
semantics you're looking for here.

If it _is_ accurate then hm, tricky.  It all rather depends upon how the
relevant filesystem implements reading (and writing?).  Which is why you
made it a file_operation, fair enough.

But even for ext2 and ext3 (please keep ext4 in sync with ext3 changes,
btw), if some process is partway through a big page_cache_readahead()
operation then a concurrent invalidate_inode_pages2() call won't worry it
at all: the pagecache will be reinstantiated and do_generic_mapping_read()
will proceed to copy that pagecache out to the user after the revoke() has
returned.  I think.

I'm afraid I havent paid any attention to this revoke proposal before, I
don't understand the usecases nor the implementation details so things
which are implicitly-obvious-to-you must be explained to me.  But others
will benefit from that explanation too ;)  What, exactly, are we trying to do
with the already-opened files and the currently-in-progress syscalls?

(A concurrent direct-io read might be a problem too?)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 11:26     ` Andrew Morton
@ 2007-03-16 11:44       ` Pekka J Enberg
  2007-03-16 12:26         ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Pekka J Enberg @ 2007-03-16 11:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch, alan

On Fri, 16 Mar 2007, Andrew Morton wrote:
> I assume that any future callers to sys_read() will reliably do the right
> thing at this stage, so we are concerned with threads which are presently
> partway through a read from this inode?

Yes. We first revoke the file descriptors under tasklist_lock after which 
any new operations on the revoked inode descriptors fail.

On Fri, 16 Mar 2007, Andrew Morton wrote:
> If it _is_ accurate then hm, tricky.  It all rather depends upon how the
> relevant filesystem implements reading (and writing?).  Which is why you
> made it a file_operation, fair enough.

Yeah, filesystem dependent. Writes are not a problem as do_sync() will 
wait until the writers are done. For that part, generic_file_revoke() 
should be fine although suboptimal for most filesystems.

On Fri, 16 Mar 2007, Andrew Morton wrote:
> But even for ext2 and ext3 (please keep ext4 in sync with ext3 changes,
> btw), if some process is partway through a big page_cache_readahead()
> operation then a concurrent invalidate_inode_pages2() call won't worry it
> at all: the pagecache will be reinstantiated and do_generic_mapping_read()
> will proceed to copy that pagecache out to the user after the revoke() has
> returned.  I think.

That's bad. Can we perhaps wait until readers are done?

On Fri, 16 Mar 2007, Andrew Morton wrote: 
> I'm afraid I havent paid any attention to this revoke proposal before, I
> don't understand the usecases nor the implementation details so things
> which are implicitly-obvious-to-you must be explained to me.  But others
> will benefit from that explanation too ;)  What, exactly, are we trying to do
> with the already-opened files and the currently-in-progress syscalls?

You use revoke() (with chown, for example) to gain exclusive access to 
an inode that might be in use by other processes. This means that we must 
mke sure that:

  - operations on opened file descriptors pointing to that inode fail
  - there are no shared mappings visible to other processes
  - in-progress system calls are either completed (writes) or abort 
    (reads)

After revoke() system call returns, you are guaranteed to have revoked 
access to an inode for any processes that had access to it when you 
started the operation. The caller is responsible for blocking any future 
open(2) calls that might occur while revoke() takes care of fork(2) and 
dup(2) during the operation.

On Fri, 16 Mar 2007, Andrew Morton wrote: 
> (A concurrent direct-io read might be a problem too?)

Good point. We would need to take down those too.

			Pekka

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 11:44       ` Pekka J Enberg
@ 2007-03-16 12:26         ` Andrew Morton
  2007-03-16 14:45           ` Pekka J Enberg
                             ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Andrew Morton @ 2007-03-16 12:26 UTC (permalink / raw)
  To: Pekka J Enberg; +Cc: linux-kernel, hch, alan

On Fri, 16 Mar 2007 13:44:27 +0200 (EET) Pekka J Enberg <penberg@cs.helsinki.fi> wrote:

> On Fri, 16 Mar 2007, Andrew Morton wrote:
> > I assume that any future callers to sys_read() will reliably do the right
> > thing at this stage, so we are concerned with threads which are presently
> > partway through a read from this inode?
> 
> Yes. We first revoke the file descriptors under tasklist_lock after which 
> any new operations on the revoked inode descriptors fail.

OK.

> On Fri, 16 Mar 2007, Andrew Morton wrote:
> > If it _is_ accurate then hm, tricky.  It all rather depends upon how the
> > relevant filesystem implements reading (and writing?).  Which is why you
> > made it a file_operation, fair enough.
> 
> Yeah, filesystem dependent. Writes are not a problem as do_sync() will 
> wait until the writers are done. For that part, generic_file_revoke() 
> should be fine although suboptimal for most filesystems.

I'm not sure that running do_fsync() will guarantee that all sys_write()
callers will have finished their syscall.  Probably they will have, in
practice.  But there is logic in the sync paths to deliberately bale out
if we're competing with ongoing dirtyings, to avoid livelocking.

> On Fri, 16 Mar 2007, Andrew Morton wrote:
> > But even for ext2 and ext3 (please keep ext4 in sync with ext3 changes,
> > btw), if some process is partway through a big page_cache_readahead()
> > operation then a concurrent invalidate_inode_pages2() call won't worry it
> > at all: the pagecache will be reinstantiated and do_generic_mapping_read()
> > will proceed to copy that pagecache out to the user after the revoke() has
> > returned.  I think.
> 
> That's bad. Can we perhaps wait until readers are done?

Gee.  The only way I can think of guaranteeing this is with a full-on
freeze_processes/thaw_processes, which is the biggest synchronisation
barrier we have, apart from sys_reboot().

Now it just so happens that for ext2/3/4-style filesystems, we re-check
i_size inside the inner loop to handle concurrent truncates (see
i_size_read() calls in do_generic_mapping_read().

Perhaps the revoke() code can hook into there in some fashion to tell the
process-which-is-presently-running-read() that it is now reading crap.

That still doesn't give us a way of making revoke() wait until all
read()ers have finished their reads.

What you're trying to do here is very similar to truncate(), and truncate()
has had a lot of work put into it, and it does work.  So if revoke() were
to

a) grab i_mutex to keep write()s away
b) set i_size to zero
c) run truncate_inode_pages()

then I expect that you'll have the guarantees which you need.  This is
because the read() caller will synchronise against revoke() at each
lock_page(), and the read() caller will check i_size prior to locking each
page.

However, modifying i_size like this might be a problem - the inode could be
dirty and it'll get written to disk!  Perhaps we could change i_size_read()
to cheat and to return zero if there's a revoke in progress.

> On Fri, 16 Mar 2007, Andrew Morton wrote: 
> > I'm afraid I havent paid any attention to this revoke proposal before, I
> > don't understand the usecases nor the implementation details so things
> > which are implicitly-obvious-to-you must be explained to me.  But others
> > will benefit from that explanation too ;)  What, exactly, are we trying to do
> > with the already-opened files and the currently-in-progress syscalls?
> 
> You use revoke() (with chown, for example) to gain exclusive access to 
> an inode that might be in use by other processes. This means that we must 
> mke sure that:
> 
>   - operations on opened file descriptors pointing to that inode fail
>   - there are no shared mappings visible to other processes
>   - in-progress system calls are either completed (writes) or abort 
>     (reads)
> 
> After revoke() system call returns, you are guaranteed to have revoked 
> access to an inode for any processes that had access to it when you 
> started the operation. The caller is responsible for blocking any future 
> open(2) calls that might occur while revoke() takes care of fork(2) and 
> dup(2) during the operation.

<adds that to the changelog>

> On Fri, 16 Mar 2007, Andrew Morton wrote: 
> > (A concurrent direct-io read might be a problem too?)
> 
> Good point. We would need to take down those too.
> 

direct-io caches i_size for the whole operation and sometimes re-reads it. 
It does funky things to handle concurrent reads and writes.  Probably for
the DIO_LOCKING case we're OK, as direct-io has to re-check i_size_read()
after reacquiring i_mutex.  It's complex, but again, our path to success
here will be to piggyback on the existing handling of truncation.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 12:26         ` Andrew Morton
@ 2007-03-16 14:45           ` Pekka J Enberg
  2007-03-16 14:58           ` Alan Cox
  2007-03-16 20:37           ` Pekka Enberg
  2 siblings, 0 replies; 13+ messages in thread
From: Pekka J Enberg @ 2007-03-16 14:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch, alan

On Fri, 16 Mar 2007, Andrew Morton wrote:
> What you're trying to do here is very similar to truncate(), and truncate()
> has had a lot of work put into it, and it does work.

Indeed. revoke() is the same as truncate() without, well, the truncation 
part.

On Fri, 16 Mar 2007, Andrew Morton wrote:
> However, modifying i_size like this might be a problem - the inode could be
> dirty and it'll get written to disk!  Perhaps we could change i_size_read()
> to cheat and to return zero if there's a revoke in progress.

Hmph, it's probably going to get too painful so I'll look at adding some 
hooks to do_generic_mapping_read()...

			Pekka

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 12:26         ` Andrew Morton
  2007-03-16 14:45           ` Pekka J Enberg
@ 2007-03-16 14:58           ` Alan Cox
  2007-03-16 14:27             ` Pekka Enberg
  2007-03-16 14:30             ` Pekka Enberg
  2007-03-16 20:37           ` Pekka Enberg
  2 siblings, 2 replies; 13+ messages in thread
From: Alan Cox @ 2007-03-16 14:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Pekka J Enberg, linux-kernel, hch

> I'm not sure that running do_fsync() will guarantee that all sys_write()
> callers will have finished their syscall.  Probably they will have, in
> practice.  But there is logic in the sync paths to deliberately bale out
> if we're competing with ongoing dirtyings, to avoid livelocking.

For device files you really need to call into the device driver for this
(->flush etc).

> However, modifying i_size like this might be a problem - the inode could be
> dirty and it'll get written to disk!  Perhaps we could change i_size_read()
> to cheat and to return zero if there's a revoke in progress.

The cheating is a bit messier than that - you might be revoking on a
cluster file system and I'm still trying to get my head around what the
semantics for that are. Lying about sizes will break the coherency
protocols I think

Serious question - do we actually need revoke() on a normal file ? BSD
has never had this, SYS5 has never had this.

Alan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 14:58           ` Alan Cox
@ 2007-03-16 14:27             ` Pekka Enberg
  2007-03-16 14:46               ` Pekka Enberg
  2007-03-16 14:30             ` Pekka Enberg
  1 sibling, 1 reply; 13+ messages in thread
From: Pekka Enberg @ 2007-03-16 14:27 UTC (permalink / raw)
  To: Alan Cox; +Cc: Andrew Morton, linux-kernel, hch

On 3/16/07, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> Serious question - do we actually need revoke() on a normal file ? BSD
> has never had this, SYS5 has never had this.

It's needed for forced unmount (bits of it anyway) and
partial-revocation in SLIM.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 14:27             ` Pekka Enberg
@ 2007-03-16 14:46               ` Pekka Enberg
  0 siblings, 0 replies; 13+ messages in thread
From: Pekka Enberg @ 2007-03-16 14:46 UTC (permalink / raw)
  To: Alan Cox; +Cc: Andrew Morton, linux-kernel, hch

On 3/16/07, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > Serious question - do we actually need revoke() on a normal file ? BSD
> > has never had this, SYS5 has never had this.

On 3/16/07, Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> It's needed for forced unmount (bits of it anyway) and
> partial-revocation in SLIM.

And btw, you do need support for tearing down mmap for device files too, right?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 14:58           ` Alan Cox
  2007-03-16 14:27             ` Pekka Enberg
@ 2007-03-16 14:30             ` Pekka Enberg
  1 sibling, 0 replies; 13+ messages in thread
From: Pekka Enberg @ 2007-03-16 14:30 UTC (permalink / raw)
  To: Alan Cox; +Cc: Andrew Morton, linux-kernel, hch

On 3/16/07, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > I'm not sure that running do_fsync() will guarantee that all sys_write()
> > callers will have finished their syscall.  Probably they will have, in
> > practice.  But there is logic in the sync paths to deliberately bale out
> > if we're competing with ongoing dirtyings, to avoid livelocking.
>
> For device files you really need to call into the device driver for this
> (->flush etc).

Sure but the do_fsync() bits are part of generic_file_revoke() which
is not meant for device files at all.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 12:26         ` Andrew Morton
  2007-03-16 14:45           ` Pekka J Enberg
  2007-03-16 14:58           ` Alan Cox
@ 2007-03-16 20:37           ` Pekka Enberg
  2007-03-16 20:54             ` Andrew Morton
  2 siblings, 1 reply; 13+ messages in thread
From: Pekka Enberg @ 2007-03-16 20:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, hch, alan

On 3/16/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> However, modifying i_size like this might be a problem - the inode could be
> dirty and it'll get written to disk!  Perhaps we could change i_size_read()
> to cheat and to return zero if there's a revoke in progress.

I don't think we can actually abuse i_size_read() in any sane manner
because the inode needs to be usable for anyone who just did open(2)
after revoke or whoever called frevoke(2).

What we could do is add a "I am revoked" flag to struct file which
blocks any future ->readpage, ->readpages, and ->direct_IO on the
file. Alternatively, we could change the ->f_mapping to point to an
address space that has "revoked address space" operations. Hmm.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/5] revoke: core code
  2007-03-16 20:37           ` Pekka Enberg
@ 2007-03-16 20:54             ` Andrew Morton
  0 siblings, 0 replies; 13+ messages in thread
From: Andrew Morton @ 2007-03-16 20:54 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: linux-kernel, hch, alan, Al Viro

On Fri, 16 Mar 2007 22:37:57 +0200 "Pekka Enberg" <penberg@cs.helsinki.fi> wrote:

> What we could do is add a "I am revoked" flag to struct file which
> blocks any future ->readpage, ->readpages, and ->direct_IO on the
> file. Alternatively, we could change the ->f_mapping to point to an
> address space that has "revoked address space" operations. Hmm.

iirc, that was part of the rationale for the introduction of f_mapping.

I'll cc he-who-never-replies, who did that work.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2007-03-16 20:54 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-11 11:30 [PATCH 2/5] revoke: core code Pekka J Enberg
2007-03-16  1:34 ` Andrew Morton
2007-03-16  6:44   ` Pekka Enberg
2007-03-16 11:26     ` Andrew Morton
2007-03-16 11:44       ` Pekka J Enberg
2007-03-16 12:26         ` Andrew Morton
2007-03-16 14:45           ` Pekka J Enberg
2007-03-16 14:58           ` Alan Cox
2007-03-16 14:27             ` Pekka Enberg
2007-03-16 14:46               ` Pekka Enberg
2007-03-16 14:30             ` Pekka Enberg
2007-03-16 20:37           ` Pekka Enberg
2007-03-16 20:54             ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox