* [PATCH 01/25] filesystem helpers for custom 'struct file's
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:52 ` [PATCH 02/25] rearrange may_open() to be r/o friendly Dave Hansen
` (23 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Christoph H. says this stands on its own and can go in before the
rest of the r/o bind mount set.
---
Some filesystems forego the vfs and may_open() and create their
own 'struct file's.
This patch creates a couple of helper functions which can be
used by these filesystems, and will provide a unified place
which the r/o bind mount code may patch.
Also, rename an existing, static-scope init_file() to a less
generic name.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/configfs/dir.c | 5 ++-
lxc-dave/fs/file_table.c | 60 ++++++++++++++++++++++++++++++++++++++++++
lxc-dave/fs/hugetlbfs/inode.c | 22 ++++++---------
lxc-dave/include/linux/file.h | 9 ++++++
lxc-dave/ipc/shm.c | 13 +++------
lxc-dave/mm/shmem.c | 7 +---
lxc-dave/mm/tiny-shmem.c | 19 ++++---------
lxc-dave/net/socket.c | 18 ++++++------
8 files changed, 104 insertions(+), 49 deletions(-)
diff -puN fs/configfs/dir.c~filesystem-helpers-for-custom-struct-file-s fs/configfs/dir.c
--- lxc/fs/configfs/dir.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/fs/configfs/dir.c 2007-09-20 12:16:08.000000000 -0700
@@ -142,7 +142,7 @@ static int init_dir(struct inode * inode
return 0;
}
-static int init_file(struct inode * inode)
+static int configfs_init_file(struct inode * inode)
{
inode->i_size = PAGE_SIZE;
inode->i_fop = &configfs_file_operations;
@@ -283,7 +283,8 @@ static int configfs_attach_attr(struct c
dentry->d_fsdata = configfs_get(sd);
sd->s_dentry = dentry;
- error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+ error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
+ configfs_init_file);
if (error) {
configfs_put(sd);
return error;
diff -puN fs/file_table.c~filesystem-helpers-for-custom-struct-file-s fs/file_table.c
--- lxc/fs/file_table.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/fs/file_table.c 2007-09-20 12:16:08.000000000 -0700
@@ -138,6 +138,66 @@ fail:
EXPORT_SYMBOL(get_empty_filp);
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ * @mnt: the vfsmount on which the file will reside
+ * @dentry: the dentry representing the new file
+ * @mode: the mode with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ *
+ * Use this instead of get_empty_filp() to get a new
+ * 'struct file'. Do so because of the same initialization
+ * pitfalls reasons listed for init_file(). This is a
+ * preferred interface to using init_file().
+ *
+ * If all the callers of init_file() are eliminated, its
+ * code should be moved into this function.
+ */
+struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop)
+{
+ struct file *file;
+ struct path;
+
+ file = get_empty_filp();
+ if (!file)
+ return NULL;
+
+ init_file(file, mnt, dentry, mode, fop);
+ return file;
+}
+EXPORT_SYMBOL(alloc_file);
+
+/**
+ * init_file - initialize a 'struct file'
+ * @file: the already allocated 'struct file' to initialized
+ * @mnt: the vfsmount on which the file resides
+ * @dentry: the dentry representing this file
+ * @mode: the mode the file is opened with
+ * @fop: the 'struct file_operations' for this file
+ *
+ * Use this instead of setting the members directly. Doing so
+ * avoids making mistakes like forgetting the mntget() or
+ * forgetting to take a write on the mnt.
+ *
+ * Note: This is a crappy interface. It is here to make
+ * merging with the existing users of get_empty_filp()
+ * who have complex failure logic easier. All users
+ * of this should be moving to alloc_file().
+ */
+int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop)
+{
+ int error = 0;
+ file->f_path.dentry = dentry;
+ file->f_path.mnt = mntget(mnt);
+ file->f_mapping = dentry->d_inode->i_mapping;
+ file->f_mode = mode;
+ file->f_op = fop;
+ return error;
+}
+EXPORT_SYMBOL(init_file);
+
void fastcall fput(struct file *file)
{
if (atomic_dec_and_test(&file->f_count))
diff -puN fs/hugetlbfs/inode.c~filesystem-helpers-for-custom-struct-file-s fs/hugetlbfs/inode.c
--- lxc/fs/hugetlbfs/inode.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/fs/hugetlbfs/inode.c 2007-09-20 12:16:08.000000000 -0700
@@ -810,16 +810,11 @@ struct file *hugetlb_file_setup(const ch
if (!dentry)
goto out_shm_unlock;
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto out_dentry;
-
error = -ENOSPC;
inode = hugetlbfs_get_inode(root->d_sb, current->fsuid,
current->fsgid, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto out_file;
+ goto out_dentry;
error = -ENOMEM;
if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
@@ -828,17 +823,18 @@ struct file *hugetlb_file_setup(const ch
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0;
- file->f_path.mnt = mntget(hugetlbfs_vfsmount);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &hugetlbfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+
+ error = -ENFILE;
+ file = alloc_file(hugetlbfs_vfsmount, dentry,
+ FMODE_WRITE | FMODE_READ,
+ &hugetlbfs_file_operations);
+ if (!file)
+ goto out_inode;
+
return file;
out_inode:
iput(inode);
-out_file:
- put_filp(file);
out_dentry:
dput(dentry);
out_shm_unlock:
diff -puN include/linux/file.h~filesystem-helpers-for-custom-struct-file-s include/linux/file.h
--- lxc/include/linux/file.h~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/include/linux/file.h 2007-09-20 12:16:08.000000000 -0700
@@ -62,6 +62,15 @@ extern struct kmem_cache *filp_cachep;
extern void FASTCALL(__fput(struct file *));
extern void FASTCALL(fput(struct file *));
+struct file_operations;
+struct vfsmount;
+struct dentry;
+extern int init_file(struct file *, struct vfsmount *mnt,
+ struct dentry *dentry, mode_t mode,
+ const struct file_operations *fop);
+extern struct file *alloc_file(struct vfsmount *, struct dentry *dentry,
+ mode_t mode, const struct file_operations *fop);
+
static inline void fput_light(struct file *file, int fput_needed)
{
if (unlikely(fput_needed))
diff -puN ipc/shm.c~filesystem-helpers-for-custom-struct-file-s ipc/shm.c
--- lxc/ipc/shm.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/ipc/shm.c 2007-09-20 12:16:08.000000000 -0700
@@ -906,7 +906,7 @@ long do_shmat(int shmid, char __user *sh
goto out_unlock;
path.dentry = dget(shp->shm_file->f_path.dentry);
- path.mnt = mntget(shp->shm_file->f_path.mnt);
+ path.mnt = shp->shm_file->f_path.mnt;
shp->shm_nattch++;
size = i_size_read(path.dentry->d_inode);
shm_unlock(shp);
@@ -914,18 +914,16 @@ long do_shmat(int shmid, char __user *sh
err = -ENOMEM;
sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
if (!sfd)
- goto out_put_path;
+ goto out_put_dentry;
err = -ENOMEM;
- file = get_empty_filp();
+
+ file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
if (!file)
goto out_free;
- file->f_op = &shm_file_operations;
file->private_data = sfd;
- file->f_path = path;
file->f_mapping = shp->shm_file->f_mapping;
- file->f_mode = f_mode;
sfd->id = shp->id;
sfd->ns = get_ipc_ns(ns);
sfd->file = shp->shm_file;
@@ -976,9 +974,8 @@ out_unlock:
out_free:
kfree(sfd);
-out_put_path:
+out_put_dentry:
dput(path.dentry);
- mntput(path.mnt);
goto out_nattch;
}
diff -puN mm/shmem.c~filesystem-helpers-for-custom-struct-file-s mm/shmem.c
--- lxc/mm/shmem.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/mm/shmem.c 2007-09-20 12:16:08.000000000 -0700
@@ -2518,11 +2518,8 @@ struct file *shmem_file_setup(char *name
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &shmem_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &shmem_file_operations);
return file;
close_file:
diff -puN mm/tiny-shmem.c~filesystem-helpers-for-custom-struct-file-s mm/tiny-shmem.c
--- lxc/mm/tiny-shmem.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/mm/tiny-shmem.c 2007-09-20 12:16:08.000000000 -0700
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name
if (!dentry)
goto put_memory;
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto close_file;
+ goto put_dentry;
d_instantiate(dentry, inode);
- inode->i_nlink = 0; /* It is unlinked */
+ error = -ENFILE;
+ file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &ramfs_file_operations);
+ if (!file)
+ goto put_dentry;
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &ramfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ inode->i_nlink = 0; /* It is unlinked */
/* notify everyone as to the change of file size */
error = do_truncate(dentry, size, 0, file);
diff -puN net/socket.c~filesystem-helpers-for-custom-struct-file-s net/socket.c
--- lxc/net/socket.c~filesystem-helpers-for-custom-struct-file-s 2007-09-20 12:16:08.000000000 -0700
+++ lxc-dave/net/socket.c 2007-09-20 12:16:08.000000000 -0700
@@ -363,26 +363,26 @@ static int sock_alloc_fd(struct file **f
static int sock_attach_fd(struct socket *sock, struct file *file)
{
+ struct dentry *dentry;
struct qstr name = { .name = "" };
- file->f_path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
- if (unlikely(!file->f_path.dentry))
+ dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
+ if (unlikely(!dentry))
return -ENOMEM;
- file->f_path.dentry->d_op = &sockfs_dentry_operations;
+ dentry->d_op = &sockfs_dentry_operations;
/*
* We dont want to push this dentry into global dentry hash table.
* We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
* This permits a working /proc/$pid/fd/XXX on sockets
*/
- file->f_path.dentry->d_flags &= ~DCACHE_UNHASHED;
- d_instantiate(file->f_path.dentry, SOCK_INODE(sock));
- file->f_path.mnt = mntget(sock_mnt);
- file->f_mapping = file->f_path.dentry->d_inode->i_mapping;
+ dentry->d_flags &= ~DCACHE_UNHASHED;
+ d_instantiate(dentry, SOCK_INODE(sock));
sock->file = file;
- file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
- file->f_mode = FMODE_READ | FMODE_WRITE;
+ init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
+ &socket_file_ops);
+ SOCK_INODE(sock)->i_fop = &socket_file_ops;
file->f_flags = O_RDWR;
file->f_pos = 0;
file->private_data = sock;
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 02/25] rearrange may_open() to be r/o friendly
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
2007-09-20 19:52 ` [PATCH 01/25] filesystem helpers for custom 'struct file's Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:52 ` [PATCH 03/25] give may_open() a local 'mnt' variable Dave Hansen
` (22 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
may_open() calls vfs_permission() before it does checks for
IS_RDONLY(inode). It checks _again_ inside of vfs_permission().
The check inside of vfs_permission() is going away eventually.
With the mnt_want/drop_write() functions, all of the r/o
checks (except for this one) are consistently done before
calling permission(). Because of this, I'd like to use
permission() to hold a debugging check to make sure that
the mnt_want/drop_write() calls are actually being made.
So, to do this:
1. remove the IS_RDONLY() check from permission()
2. enforce that you must mnt_want_write() before
even calling permission()
3. actually add the debugging check to permission()
We need to rearrange may_open() to do r/o checks
before calling permission(). Here's the patch.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff -puN fs/namei.c~rearrange-permission-and-ro-checks-in-may_open fs/namei.c
--- lxc/fs/namei.c~rearrange-permission-and-ro-checks-in-may_open 2007-09-20 12:16:09.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:09.000000000 -0700
@@ -1579,10 +1579,6 @@ int may_open(struct nameidata *nd, int a
if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
return -EISDIR;
- error = vfs_permission(nd, acc_mode);
- if (error)
- return error;
-
/*
* FIFO's, sockets and device files are special: they don't
* actually live on the filesystem itself, and as such you
@@ -1597,6 +1593,10 @@ int may_open(struct nameidata *nd, int a
flag &= ~O_TRUNC;
} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
return -EROFS;
+
+ error = vfs_permission(nd, acc_mode);
+ if (error)
+ return error;
/*
* An append-only file must be opened in append mode for writing.
*/
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 03/25] give may_open() a local 'mnt' variable
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
2007-09-20 19:52 ` [PATCH 01/25] filesystem helpers for custom 'struct file's Dave Hansen
2007-09-20 19:52 ` [PATCH 02/25] rearrange may_open() to be r/o friendly Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:57 ` Christoph Hellwig
2007-09-20 19:52 ` [PATCH 04/25] create cleanup helper svc_msnfs() Dave Hansen
` (21 subsequent siblings)
24 siblings, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
First of all, this makes the structure jumping look a little
bit cleaner. So, this stands alone as a tiny cleanup. But,
we also need 'mnt' by itself a few more times later in this
series, so this isn't _just_ a cleanup.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff -puN fs/namei.c~give-may_open-a-local-mnt-variable fs/namei.c
--- lxc/fs/namei.c~give-may_open-a-local-mnt-variable 2007-09-20 12:16:09.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:09.000000000 -0700
@@ -230,6 +230,10 @@ int permission(struct inode *inode, int
{
umode_t mode = inode->i_mode;
int retval, submask;
+ struct vfsmount *mnt = NULL;
+
+ if (nd)
+ mnt = nd->mnt;
if (mask & MAY_WRITE) {
@@ -254,7 +258,7 @@ int permission(struct inode *inode, int
* the fs is mounted with the "noexec" flag.
*/
if ((mask & MAY_EXEC) && S_ISREG(mode) && (!(mode & S_IXUGO) ||
- (nd && nd->mnt && (nd->mnt->mnt_flags & MNT_NOEXEC))))
+ (mnt && (mnt->mnt_flags & MNT_NOEXEC))))
return -EACCES;
/* Ordinary permission routines do not understand MAY_APPEND. */
_
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH 03/25] give may_open() a local 'mnt' variable
2007-09-20 19:52 ` [PATCH 03/25] give may_open() a local 'mnt' variable Dave Hansen
@ 2007-09-20 19:57 ` Christoph Hellwig
0 siblings, 0 replies; 48+ messages in thread
From: Christoph Hellwig @ 2007-09-20 19:57 UTC (permalink / raw)
To: Dave Hansen; +Cc: akpm, linux-kernel, hch
On Thu, Sep 20, 2007 at 12:52:52PM -0700, Dave Hansen wrote:
>
> First of all, this makes the structure jumping look a little
> bit cleaner. So, this stands alone as a tiny cleanup. But,
> we also need 'mnt' by itself a few more times later in this
> series, so this isn't _just_ a cleanup.
You're touching permission, not may_open, though.
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH 04/25] create cleanup helper svc_msnfs()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (2 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 03/25] give may_open() a local 'mnt' variable Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:52 ` [PATCH 05/25] r/o bind mounts: stub functions Dave Hansen
` (20 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
I'm going to be modifying nfsd_rename() shortly to support
read-only bind mounts. This #ifdef is around the area I'm
patching, and it starts to get really ugly if I just try
to add my new code by itself. Using this little helper
makes things a lot cleaner to use.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/nfsd/vfs.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff -puN fs/nfsd/vfs.c~create-svc_msnfs-helper fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~create-svc_msnfs-helper 2007-09-20 12:16:10.000000000 -0700
+++ lxc-dave/fs/nfsd/vfs.c 2007-09-20 12:16:10.000000000 -0700
@@ -857,6 +857,15 @@ static int nfsd_direct_splice_actor(stru
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
+static inline int svc_msnfs(struct svc_fh *ffhp)
+{
+#ifdef MSNFS
+ return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
+#else
+ return 0;
+#endif
+}
+
static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -869,11 +878,9 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
err = nfserr_perm;
inode = file->f_path.dentry->d_inode;
-#ifdef MSNFS
- if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
- (!lock_may_read(inode, offset, *count)))
+
+ if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
goto out;
-#endif
/* Get readahead parameters */
ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 05/25] r/o bind mounts: stub functions
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (3 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 04/25] create cleanup helper svc_msnfs() Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:52 ` [PATCH 06/25] elevate write count open()'d files Dave Hansen
` (19 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
This patch adds two function mnt_want_write() and
mnt_drop_write(). These are used like a lock pair around
and fs operations that might cause a write to the filesystem.
Before these can become useful, we must first cover each
place in the VFS where writes are performed with a
want/drop pair. When that is complete, we can actually
introduce code that will safely check the counts before
allowing r/w<->r/o transitions to occur.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namespace.c | 54 +++++++++++++++++++++++++++++++++++++++++
lxc-dave/include/linux/mount.h | 3 ++
2 files changed, 57 insertions(+)
diff -puN fs/namespace.c~add-vfsmount-writer-count fs/namespace.c
--- lxc/fs/namespace.c~add-vfsmount-writer-count 2007-09-20 12:16:10.000000000 -0700
+++ lxc-dave/fs/namespace.c 2007-09-20 12:16:10.000000000 -0700
@@ -77,6 +77,60 @@ struct vfsmount *alloc_vfsmnt(const char
return mnt;
}
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success. When
+ * the write operation is finished, mnt_drop_write()
+ * must be called. This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+ if (__mnt_is_readonly(mnt))
+ return -EROFS;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it. Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*. This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
diff -puN include/linux/mount.h~add-vfsmount-writer-count include/linux/mount.h
--- lxc/include/linux/mount.h~add-vfsmount-writer-count 2007-09-20 12:16:10.000000000 -0700
+++ lxc-dave/include/linux/mount.h 2007-09-20 12:16:10.000000000 -0700
@@ -70,9 +70,12 @@ static inline struct vfsmount *mntget(st
return mnt;
}
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
+extern int __mnt_is_readonly(struct vfsmount *mnt);
static inline void mntput(struct vfsmount *mnt)
{
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 06/25] elevate write count open()'d files
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (4 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 05/25] r/o bind mounts: stub functions Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-11-28 8:41 ` Andrew Morton
2007-09-20 19:52 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Dave Hansen
` (18 subsequent siblings)
24 siblings, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
This is the first really tricky patch in the series. It
elevates the writer count on a mount each time a
non-special file is opened for write.
This is not completely apparent in the patch because the
two if() conditions in may_open() above the
mnt_want_write() call are, combined, equivalent to
special_file().
There is also an elevated count around the vfs_create()
call in open_namei(). The count needs to be kept elevated
all the way into the may_open() call. Otherwise, when the
write is dropped, a ro->rw transisition could occur. This
would lead to having rw access on the newly created file,
while the vfsmount is ro. That is bad.
Some filesystems forego the use of normal vfs calls to create
struct files. Make sure that these users elevate the mnt writer
count because they will get __fput(), and we need to make
sure they're balanced.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/file_table.c | 9 ++++++++-
lxc-dave/fs/namei.c | 20 ++++++++++++++++----
lxc-dave/ipc/mqueue.c | 3 +++
3 files changed, 27 insertions(+), 5 deletions(-)
diff -puN fs/file_table.c~tricky-elevate-write-count-files-are-open-ed fs/file_table.c
--- lxc/fs/file_table.c~tricky-elevate-write-count-files-are-open-ed 2007-09-20 12:16:11.000000000 -0700
+++ lxc-dave/fs/file_table.c 2007-09-20 12:16:11.000000000 -0700
@@ -194,6 +194,10 @@ int init_file(struct file *file, struct
file->f_mapping = dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
+ if (mode & FMODE_WRITE) {
+ error = mnt_want_write(mnt);
+ WARN_ON(error);
+ }
return error;
}
EXPORT_SYMBOL(init_file);
@@ -231,8 +235,11 @@ void fastcall __fput(struct file *file)
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
cdev_put(inode->i_cdev);
fops_put(file->f_op);
- if (file->f_mode & FMODE_WRITE)
+ if (file->f_mode & FMODE_WRITE) {
put_write_access(inode);
+ if (!special_file(inode->i_mode))
+ mnt_drop_write(mnt);
+ }
put_pid(file->f_owner.pid);
file_kill(file);
file->f_path.dentry = NULL;
diff -puN fs/namei.c~tricky-elevate-write-count-files-are-open-ed fs/namei.c
--- lxc/fs/namei.c~tricky-elevate-write-count-files-are-open-ed 2007-09-20 12:16:11.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:11.000000000 -0700
@@ -1595,8 +1595,15 @@ int may_open(struct nameidata *nd, int a
return -EACCES;
flag &= ~O_TRUNC;
- } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
- return -EROFS;
+ } else if (flag & FMODE_WRITE) {
+ /*
+ * effectively: !special_file()
+ * balanced by __fput()
+ */
+ error = mnt_want_write(nd->mnt);
+ if (error)
+ return error;
+ }
error = vfs_permission(nd, acc_mode);
if (error)
@@ -1739,14 +1746,17 @@ do_last:
}
if (IS_ERR(nd->intent.open.file)) {
- mutex_unlock(&dir->d_inode->i_mutex);
error = PTR_ERR(nd->intent.open.file);
- goto exit_dput;
+ goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
+ error = mnt_want_write(nd->mnt);
+ if (error)
+ goto exit_mutex_unlock;
error = open_namei_create(nd, &path, flag, mode);
+ mnt_drop_write(nd->mnt);
if (error)
goto exit;
return 0;
@@ -1784,6 +1794,8 @@ ok:
goto exit;
return 0;
+exit_mutex_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
dput_path(&path, nd);
exit:
diff -puN ipc/mqueue.c~tricky-elevate-write-count-files-are-open-ed ipc/mqueue.c
--- lxc/ipc/mqueue.c~tricky-elevate-write-count-files-are-open-ed 2007-09-20 12:16:11.000000000 -0700
+++ lxc-dave/ipc/mqueue.c 2007-09-20 12:16:11.000000000 -0700
@@ -686,6 +686,9 @@ asmlinkage long sys_mq_open(const char _
goto out;
filp = do_open(dentry, oflag);
} else {
+ error = mnt_want_write(mqueue_mnt);
+ if (error)
+ goto out;
filp = do_create(mqueue_mnt->mnt_root, dentry,
oflag, mode, u_attr);
}
_
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH 06/25] elevate write count open()'d files
2007-09-20 19:52 ` [PATCH 06/25] elevate write count open()'d files Dave Hansen
@ 2007-11-28 8:41 ` Andrew Morton
2007-11-28 17:33 ` Dave Hansen
0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-11-28 8:41 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Thu, 20 Sep 2007 12:52:56 -0700 Dave Hansen <haveblue@us.ibm.com> wrote:
> This is the first really tricky patch in the series. It
> elevates the writer count on a mount each time a
> non-special file is opened for write.
>
> This is not completely apparent in the patch because the
> two if() conditions in may_open() above the
> mnt_want_write() call are, combined, equivalent to
> special_file().
>
> There is also an elevated count around the vfs_create()
> call in open_namei(). The count needs to be kept elevated
> all the way into the may_open() call. Otherwise, when the
> write is dropped, a ro->rw transisition could occur. This
> would lead to having rw access on the newly created file,
> while the vfsmount is ro. That is bad.
>
> Some filesystems forego the use of normal vfs calls to create
> struct files. Make sure that these users elevate the mnt writer
> count because they will get __fput(), and we need to make
> sure they're balanced.
For some reason this has started oopsing on me:
[ 39.941273] audit(1196237507.111:5): audit_pid=1882 old=0 by auid=4294967295 subj=system_u:system_r:auditd_t:s0
[ 35.037235] BUG: unable to handle kernel NULL pointer dereference at virtual address 0000006a
[ 35.040536] printing eip: c017815c *pde = 00000000
[ 35.043913] Oops: 0000 [#1] PREEMPT
[ 35.047270] last sysfs file: /devices/platform/i8042/serio0/description
[ 35.050679] Modules linked in: ipv6 autofs4 hidp l2cap bluetooth sunrpc nf_conntrack_netbios_ns ipt_REJECT nf_conntrack_ipv4 xt_state nf_conntrack xt_tcpudp iptable_filter ip_tables x_tables acpi_cpufreq nvram ohci1394 ieee1394 ehci_hcd uhci_hcd sg joydev snd_hda_intel snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm sr_mod snd_timer snd i2c_i801 ipw2200 cdrom ieee80211 pcspkr piix soundcore ieee80211_crypt i2c_core snd_page_alloc button generic ext3 jbd ide_disk ide_core
[ 35.071690]
[ 35.075612] Pid: 2277, comm: hald-addon-acpi Not tainted (2.6.24-rc3-mm2 #8)
[ 35.080517] EIP: 0060:[<c017815c>] EFLAGS: 00010296 CPU: 0
[ 35.084585] EIP is at open_pathname+0x37b/0x6a5
[ 35.088631] EAX: 00000000 EBX: fffffff0 ECX: f61fc000 EDX: c01a02c1
[ 35.092747] ESI: f61fdf20 EDI: 00000008 EBP: f61fdf84 ESP: f61fdefc
[ 35.098762] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
[ 35.102847] Process hald-addon-acpi (pid: 2277, ti=F61FC000 task=F61FEEB0 task.ti=F61FC000)
[ 35.103028] Stack: 00008000 f6589000 00000000 00000004 00000000 00008001 00000000 00000246
[ 35.107359] 0000023a f5a9f908 f780cbc0 00000000 00000000 00000000 00000101 00000001
[ 35.111671] 00000000 f61fdf5c 00000246 c016cdd9 00000246 f782b6a4 00000004 f782b6a4
[ 35.115978] Call Trace:
[ 35.124180] [<c0104a74>] show_trace_log_lvl+0x12/0x25
[ 35.128422] [<c0104b13>] show_stack_log_lvl+0x8c/0x97
[ 35.132646] [<c0104ba8>] show_registers+0x8a/0x1c0
[ 35.136900] [<c0104dcc>] die+0xee/0x1c4
[ 35.141165] [<c011683c>] do_page_fault+0x405/0x4e1
[ 35.145486] [<c031db7a>] error_code+0x6a/0x70
[ 35.149672] [<c016e02b>] do_sys_open+0x42/0xb8
[ 35.153714] [<c016e0cd>] sys_open+0x16/0x18
[ 35.157763] [<c0103b2a>] syscall_call+0x7/0xb
[ 35.161831] =======================
[ 35.165899] Code: 7d 80 00 0f 84 a9 00 00 00 8b 45 a0 e8 9e 94 00 00 e9 9c 00 00 00 8b 95 78 ff ff ff 89 f0 e8 23 4f ff ff 89 c3 8b 45 9c 8b 40 28 <0f> b7 40 6a 25 00 f0 00 00 3d 00 20 00 00 0f 84 0c 03 00 00 3d
[ 35.175160] EIP: [<c017815c>] open_pathname+0x37b/0x6a5 SS:ESP 0068:f61fdefc
I debugged it a bit. hald-addon-acpi is opening /proc/acpi/event and we're
getting here:
error = may_open(&nd, acc_mode, flag);
if (error) {
if (open_will_write_to_fs(flag, nd.dentry->d_inode))
mnt_drop_write(nd.mnt);
goto exit;
}
filp = nameidata_to_filp(&nd, sys_open_flag);
/*
* It is now safe to drop the mnt write
* because the filp has had a write taken
* on its behalf.
*/
if (open_will_write_to_fs(flag, nd.dentry->d_inode))
mnt_drop_write(nd.mnt);
return filp;
the final call to open_will_write_to_fs() is oopsing over null
nd.dentry->d_inode.
Given that nameidata_to_filp() can call path_release() which "destroys the
original nameidata", it looks like this was always buggy?
This:
--- a/fs/namei.c~b
+++ a/fs/namei.c
@@ -1722,7 +1722,7 @@ static inline int sys_open_flags_to_name
return flag;
}
-static inline int open_will_write_to_fs(int flag, struct inode *inode)
+static int open_will_write_to_fs(int flag, struct inode *inode)
{
/*
* We'll never write to the fs underlying
@@ -1752,6 +1752,7 @@ struct file *open_pathname(int dfd, cons
struct path path;
struct dentry *dir;
int count = 0;
+ int will_write;
int flag = sys_open_flags_to_namei_flags(sys_open_flag);
acc_mode = ACC_MODE(flag);
@@ -1870,14 +1871,15 @@ ok:
* be avoided. Taking this mnt write here
* ensures that (2) can not occur.
*/
- if (open_will_write_to_fs(flag, nd.dentry->d_inode)) {
+ will_write = open_will_write_to_fs(flag, nd.dentry->d_inode);
+ if (will_write) {
error = mnt_want_write(nd.mnt);
if (error)
goto exit;
}
error = may_open(&nd, acc_mode, flag);
if (error) {
- if (open_will_write_to_fs(flag, nd.dentry->d_inode))
+ if (will_write)
mnt_drop_write(nd.mnt);
goto exit;
}
@@ -1887,7 +1889,7 @@ ok:
* because the filp has had a write taken
* on its behalf.
*/
- if (open_will_write_to_fs(flag, nd.dentry->d_inode))
+ if (will_write)
mnt_drop_write(nd.mnt);
return filp;
_
seems a fairly obvious fix and is more efficient anyway.
I hate this patch series.
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (5 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 06/25] elevate write count open()'d files Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-21 8:17 ` Andrew Morton
2007-09-21 23:03 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Andrew Morton
2007-09-20 19:52 ` [PATCH 08/25] elevate writer count for chown and friends Dave Hansen
` (17 subsequent siblings)
24 siblings, 2 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Some ioctl()s can cause writes to the filesystem. Take
these, and make them use mnt_want/drop_write() instead.
We need to pass the filp one layer deeper in XFS, but
somebody _just_ pulled it out in February because nobody
was using it, so I don't feel guilty for adding it back.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/ext2/ioctl.c | 46 +++++++++-----
lxc-dave/fs/ext3/ioctl.c | 100 +++++++++++++++++++++-----------
lxc-dave/fs/ext4/ioctl.c | 105 +++++++++++++++++++++-------------
lxc-dave/fs/fat/file.c | 10 +--
lxc-dave/fs/hfsplus/ioctl.c | 40 ++++++++----
lxc-dave/fs/jfs/ioctl.c | 33 ++++++----
lxc-dave/fs/ocfs2/ioctl.c | 11 +--
lxc-dave/fs/reiserfs/ioctl.c | 55 +++++++++++------
lxc-dave/fs/xfs/linux-2.6/xfs_ioctl.c | 15 +++-
lxc-dave/fs/xfs/linux-2.6/xfs_iops.c | 7 --
lxc-dave/fs/xfs/linux-2.6/xfs_lrw.c | 9 ++
11 files changed, 274 insertions(+), 157 deletions(-)
diff -puN fs/ext2/ioctl.c~ioctl-mnt-takers fs/ext2/ioctl.c
--- lxc/fs/ext2/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/ext2/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -12,6 +12,7 @@
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include <linux/smp_lock.h>
#include <asm/current.h>
#include <asm/uaccess.h>
@@ -20,6 +21,7 @@
int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned long arg)
{
+ int ret;
struct ext2_inode_info *ei = EXT2_I(inode);
unsigned int flags;
@@ -33,14 +35,19 @@ int ext2_ioctl (struct inode * inode, st
case EXT2_IOC_SETFLAGS: {
unsigned int oldflags;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ ret = mnt_want_write(filp->f_vfsmnt);
+ if (ret)
+ return ret;
+
+ if (!is_owner_or_cap(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT2_DIRSYNC_FL;
@@ -57,7 +64,8 @@ int ext2_ioctl (struct inode * inode, st
if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ ret = -EPERM;
+ goto setflags_out;
}
}
@@ -69,20 +77,26 @@ int ext2_ioctl (struct inode * inode, st
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return ret;
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
case EXT2_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *) arg))
- return -EFAULT;
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- return 0;
+ ret = mnt_want_write(filp->f_vfsmnt);
+ if (ret)
+ return ret;
+ if (get_user(inode->i_generation, (int __user *) arg)) {
+ ret = -EFAULT;
+ } else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ }
+ mnt_drop_write(filp->f_vfsmnt);
+ return ret;
default:
return -ENOTTY;
}
diff -puN fs/ext3/ioctl.c~ioctl-mnt-takers fs/ext3/ioctl.c
--- lxc/fs/ext3/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/ext3/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, st
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto flags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto flags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT3_DIRSYNC_FL;
@@ -65,7 +71,8 @@ int ext3_ioctl (struct inode * inode, st
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -76,7 +83,8 @@ int ext3_ioctl (struct inode * inode, st
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -84,7 +92,8 @@ int ext3_ioctl (struct inode * inode, st
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -110,6 +119,8 @@ flags_err:
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
mutex_unlock(&inode->i_mutex);
+flags_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -124,14 +135,18 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = CURRENT_TIME_SEC;
@@ -139,6 +154,8 @@ flags_err:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
#ifdef CONFIG_JBD_DEBUG
@@ -174,18 +191,24 @@ flags_err:
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setrsvsz_out;
+ }
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
+ if (get_user(rsv_window_size, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setrsvsz_out;
+ }
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -203,7 +226,9 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
- return 0;
+setrsvsz_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
}
case EXT3_IOC_GROUP_EXTEND: {
ext3_fsblk_t n_blocks_count;
@@ -213,17 +238,20 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_extend_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
case EXT3_IOC_GROUP_ADD: {
@@ -234,18 +262,22 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
err = ext3_group_add(sb, &input);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_add_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
diff -puN fs/ext4/ioctl.c~ioctl-mnt-takers fs/ext4/ioctl.c
--- lxc/fs/ext4/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/ext4/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/ext4_fs.h>
#include <linux/ext4_jbd2.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
@@ -38,15 +39,19 @@ int ext4_ioctl (struct inode * inode, st
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto flags_out;
+ }
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto flags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT4_DIRSYNC_FL;
@@ -65,7 +70,8 @@ int ext4_ioctl (struct inode * inode, st
if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -76,7 +82,8 @@ int ext4_ioctl (struct inode * inode, st
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -84,7 +91,8 @@ int ext4_ioctl (struct inode * inode, st
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -104,12 +112,14 @@ flags_err:
ext4_journal_stop(handle);
if (err) {
mutex_unlock(&inode->i_mutex);
- return err;
+ goto flags_out;
}
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
mutex_unlock(&inode->i_mutex);
+flags_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -124,14 +134,18 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext4_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = ext4_current_time(inode);
@@ -139,6 +153,8 @@ flags_err:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
#ifdef CONFIG_JBD2_DEBUG
@@ -174,19 +190,23 @@ flags_err:
}
return -ENOTTY;
case EXT4_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setrsvsz_out;
+ }
+ if (get_user(rsv_window_size, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setrsvsz_out;
+ }
if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
@@ -203,7 +223,9 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
- return 0;
+setrsvsz_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
}
case EXT4_IOC_GROUP_EXTEND: {
ext4_fsblk_t n_blocks_count;
@@ -213,17 +235,21 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+group_extend_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
case EXT4_IOC_GROUP_ADD: {
@@ -234,18 +260,21 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
-
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
err = ext4_group_add(sb, &input);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-
+group_add_out:
+ mnt_drop_write(filp->f_vfsmnt);
return err;
}
diff -puN fs/fat/file.c~ioctl-mnt-takers fs/fat/file.c
--- lxc/fs/fat/file.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/fat/file.c 2007-09-20 12:16:12.000000000 -0700
@@ -8,6 +8,7 @@
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
#include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inod
mutex_lock(&inode->i_mutex);
- if (IS_RDONLY(inode)) {
- err = -EROFS;
- goto up;
- }
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ goto up_no_drop_write;
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -106,6 +106,8 @@ int fat_generic_ioctl(struct inode *inod
MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
mark_inode_dirty(inode);
up:
+ mnt_drop_write(filp->f_vfsmnt);
+ up_no_drop_write:
mutex_unlock(&inode->i_mutex);
return err;
}
diff -puN fs/hfsplus/ioctl.c~ioctl-mnt-takers fs/hfsplus/ioctl.c
--- lxc/fs/hfsplus/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/hfsplus/ioctl.c 2007-09-20 12:31:43.000000000 -0700
@@ -14,6 +14,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/xattr.h>
#include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, s
flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
return put_user(flags, (int __user *)arg);
case HFSPLUS_IOC_EXT2_SETFLAGS: {
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
+ int err = 0;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
}
/* don't silently ignore unsupported ext2 flags */
- if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
- return -EOPNOTSUPP;
-
+ if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+ err = -EOPNOTSUPP;
+ goto setflags_out;
+ }
if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
inode->i_flags |= S_IMMUTABLE;
HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, s
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
}
default:
return -ENOTTY;
diff -puN fs/jfs/ioctl.c~ioctl-mnt-takers fs/jfs/ioctl.c
--- lxc/fs/jfs/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/jfs/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <asm/current.h>
@@ -65,16 +66,20 @@ int jfs_ioctl(struct inode * inode, stru
return put_user(flags, (int __user *) arg);
case JFS_IOC_SETFLAGS: {
unsigned int oldflags;
+ int err;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
flags = jfs_map_ext2(flags, 1);
if (!S_ISDIR(inode->i_mode))
flags &= ~JFS_DIRSYNC_FL;
@@ -89,8 +94,10 @@ int jfs_ioctl(struct inode * inode, stru
if ((oldflags & JFS_IMMUTABLE_FL) ||
((flags ^ oldflags) &
(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
}
flags = flags & JFS_FL_USER_MODIFIABLE;
@@ -100,7 +107,9 @@ int jfs_ioctl(struct inode * inode, stru
jfs_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
}
default:
return -ENOTTY;
diff -puN fs/ocfs2/ioctl.c~ioctl-mnt-takers fs/ocfs2/ioctl.c
--- lxc/fs/ocfs2/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/ocfs2/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -58,10 +58,6 @@ static int ocfs2_set_inode_attr(struct i
goto bail;
}
- status = -EROFS;
- if (IS_RDONLY(inode))
- goto bail_unlock;
-
status = -EACCES;
if (!is_owner_or_cap(inode))
goto bail_unlock;
@@ -130,8 +126,13 @@ int ocfs2_ioctl(struct inode * inode, st
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- return ocfs2_set_inode_attr(inode, flags,
+ status = mnt_want_write(filp->f_vfsmnt);
+ if (status)
+ return status;
+ status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
+ mnt_drop_write(filp->f_vfsmnt);
+ return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
diff -puN fs/reiserfs/ioctl.c~ioctl-mnt-takers fs/reiserfs/ioctl.c
--- lxc/fs/reiserfs/ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/reiserfs/ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -4,6 +4,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/reiserfs_fs.h>
#include <linux/time.h>
#include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode,
unsigned long arg)
{
unsigned int flags;
+ int err = 0;
switch (cmd) {
case REISERFS_IOC_UNPACK:
@@ -48,47 +50,60 @@ int reiserfs_ioctl(struct inode *inode,
if (!reiserfs_attrs(inode->i_sb))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+
+ if (!is_owner_or_cap(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
if (((flags ^ REISERFS_I(inode)->
i_attrs) & (REISERFS_IMMUTABLE_FL |
REISERFS_APPEND_FL))
- && !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
+ && !capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
if ((flags & REISERFS_NOTAIL_FL) &&
S_ISREG(inode->i_mode)) {
int result;
result = reiserfs_unpack(inode, filp);
- if (result)
- return result;
+ if (result) {
+ err = result;
+ goto setflags_out;
+ }
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
}
case REISERFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
case REISERFS_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_vfsmnt);
+ if (err)
+ return err;
+ if (get_user(inode->i_generation, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setversion_out:
+ mnt_drop_write(filp->f_vfsmnt);
+ return err;
default:
return -ENOTTY;
}
diff -puN fs/xfs/linux-2.6/xfs_ioctl.c~ioctl-mnt-takers fs/xfs/linux-2.6/xfs_ioctl.c
--- lxc/fs/xfs/linux-2.6/xfs_ioctl.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/xfs/linux-2.6/xfs_ioctl.c 2007-09-20 12:16:12.000000000 -0700
@@ -526,8 +526,6 @@ xfs_attrmulti_attr_set(
char *kbuf;
int error = EFAULT;
- if (IS_RDONLY(&vp->v_inode))
- return -EROFS;
if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
return EPERM;
if (len > XATTR_SIZE_MAX)
@@ -553,8 +551,6 @@ xfs_attrmulti_attr_remove(
char *name,
__uint32_t flags)
{
- if (IS_RDONLY(&vp->v_inode))
- return -EROFS;
if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
return EPERM;
return bhv_vop_attr_remove(vp, name, flags, NULL);
@@ -564,6 +560,7 @@ STATIC int
xfs_attrmulti_by_handle(
xfs_mount_t *mp,
void __user *arg,
+ struct file *parfilp,
struct inode *parinode)
{
int error;
@@ -618,13 +615,21 @@ xfs_attrmulti_by_handle(
&ops[i].am_length, ops[i].am_flags);
break;
case ATTR_OP_SET:
+ ops[i].am_error = mnt_want_write(parfilp->f_vfsmnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_set(vp,
attr_name, ops[i].am_attrvalue,
ops[i].am_length, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_vfsmnt);
break;
case ATTR_OP_REMOVE:
+ ops[i].am_error = mnt_want_write(parfilp->f_vfsmnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_remove(vp,
attr_name, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_vfsmnt);
break;
default:
ops[i].am_error = EINVAL;
@@ -804,7 +809,7 @@ xfs_ioctl(
return xfs_attrlist_by_handle(mp, arg, inode);
case XFS_IOC_ATTRMULTI_BY_HANDLE:
- return xfs_attrmulti_by_handle(mp, arg, inode);
+ return xfs_attrmulti_by_handle(mp, arg, filp, inode);
case XFS_IOC_SWAPEXT: {
error = xfs_swapext((struct xfs_swapext __user *)arg);
diff -puN fs/xfs/linux-2.6/xfs_iops.c~ioctl-mnt-takers fs/xfs/linux-2.6/xfs_iops.c
--- lxc/fs/xfs/linux-2.6/xfs_iops.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/xfs/linux-2.6/xfs_iops.c 2007-09-20 12:16:12.000000000 -0700
@@ -156,13 +156,6 @@ xfs_ichgtime_fast(
*/
ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
- /*
- * We're not supposed to change timestamps in readonly-mounted
- * filesystems. Throw it away if anyone asks us.
- */
- if (unlikely(IS_RDONLY(inode)))
- return;
-
if (flags & XFS_ICHGTIME_MOD) {
tvp = &inode->i_mtime;
ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
diff -puN fs/xfs/linux-2.6/xfs_lrw.c~ioctl-mnt-takers fs/xfs/linux-2.6/xfs_lrw.c
--- lxc/fs/xfs/linux-2.6/xfs_lrw.c~ioctl-mnt-takers 2007-09-20 12:16:12.000000000 -0700
+++ lxc-dave/fs/xfs/linux-2.6/xfs_lrw.c 2007-09-20 12:16:12.000000000 -0700
@@ -50,6 +50,7 @@
#include "xfs_iomap.h"
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/writeback.h>
@@ -717,10 +718,16 @@ start:
if (new_size > xip->i_size)
io->io_new_size = new_size;
- if (likely(!(ioflags & IO_INVIS))) {
+ /*
+ * We're not supposed to change timestamps in readonly-mounted
+ * filesystems. Throw it away if anyone asks us.
+ */
+ if (likely(!(ioflags & IO_INVIS) &&
+ !mnt_want_write(file->f_vfsmnt))) {
file_update_time(file);
xfs_ichgtime_fast(xip, inode,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ mnt_drop_write(file->f_vfsmnt);
}
/*
_
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-20 19:52 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Dave Hansen
@ 2007-09-21 8:17 ` Andrew Morton
2007-09-21 21:15 ` Dave Hansen
2007-09-26 1:34 ` [RFC] detect missed mnt_want_write() calls Dave Hansen
2007-09-21 23:03 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Andrew Morton
1 sibling, 2 replies; 48+ messages in thread
From: Andrew Morton @ 2007-09-21 8:17 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Thu, 20 Sep 2007 12:52:57 -0700 Dave Hansen <haveblue@us.ibm.com> wrote:
> + ret = mnt_want_write(filp->f_vfsmnt);
It still creeps me out that we have this sprinkled *all over* the tree and
people will forget to do it and there's no runtime or compile-time checking
that they remembered to do it and when they forget to do it nobody will
notice that it broke until ages and ages later.
IOW: it is a sheer horror for maintainability.
Please have a think about what we can do about this. For example, if you'd
thought about this up-front, (and I think it's a big problem), we could
have done something grotty like, in mnt_want_write():
current->vfsmnt_im_allowed_to_write_to = inode;
and then check that current->vfsmnt_im_allowed_to_write_to is the correct
inode in __mark_inode_dirty() and various other strategic places. That
sort of thing.
We need to do *something*, I think. This code just doesn't look feasibly
maintainable to me.
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-21 8:17 ` Andrew Morton
@ 2007-09-21 21:15 ` Dave Hansen
2007-09-26 1:34 ` [RFC] detect missed mnt_want_write() calls Dave Hansen
1 sibling, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-21 21:15 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch, viro
On Fri, 2007-09-21 at 01:17 -0700, Andrew Morton wrote:
> On Thu, 20 Sep 2007 12:52:57 -0700 Dave Hansen <haveblue@us.ibm.com> wrote:
>
> > + ret = mnt_want_write(filp->f_vfsmnt);
>
> It still creeps me out that we have this sprinkled *all over* the tree and
> people will forget to do it and there's no runtime or compile-time checking
> that they remembered to do it and when they forget to do it nobody will
> notice that it broke until ages and ages later.
>
> IOW: it is a sheer horror for maintainability.
>
>
> Please have a think about what we can do about this. For example, if you'd
> thought about this up-front, (and I think it's a big problem), we could
> have done something grotty like, in mnt_want_write():
>
> current->vfsmnt_im_allowed_to_write_to = inode;
>
> and then check that current->vfsmnt_im_allowed_to_write_to is the correct
> inode in __mark_inode_dirty() and various other strategic places. That
> sort of thing.
Just brainstorming a bit... I tried just keeping a writer count in the
superblock which we bump at the same time as the mnt->__mnt_writers,
which makes it quite easy to check whenever we have the inode around.
This causes quite a few false positives with journal code, and I think a
few more with direct block device access. I think we can work around
those, though.
The true issue comes when we have a lot of users (more than one) of a
superblock. If anybody has a write request outstanding, then the check
I put in will get skipped. It's easy to mask real problems this way.
Putting the "inodes allowed" list in the task (or probably signal struct
to support threads) should be workable, but I worry a bit about the
cases where fds are sent across sockets.
We could also break up the mnt_want_write() requests into two classes,
the common mnt_want_write() case and mnt_want_indefinite_write() or
something. The plain (and common) case is the one for things like
rmdir, mknod, or chmod where the write is confined to a single syscall.
mnt_want_indefinite_write() would be for things that do an open() and
later work on a file descriptor. Each of these cases could have a
different variable or flag in (or hanging off) the task struct.
In __mark_inode_dirty() we would first check to see if we are currently
in one of the plain, more atomic, mnt_want_write() regions by checking a
flag. If not, we go looking through the task's file descriptors and
seeing if one of them is open for write on the inode. If both of these
conditions fail, then we have a potential bug.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* [RFC] detect missed mnt_want_write() calls
2007-09-21 8:17 ` Andrew Morton
2007-09-21 21:15 ` Dave Hansen
@ 2007-09-26 1:34 ` Dave Hansen
1 sibling, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-26 1:34 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch, viro
On Fri, 2007-09-21 at 01:17 -0700, Andrew Morton wrote:
> On Thu, 20 Sep 2007 12:52:57 -0700 Dave Hansen <haveblue@us.ibm.com> wrote:
> > + ret = mnt_want_write(filp->f_vfsmnt);
>
> It still creeps me out that we have this sprinkled *all over* the tree and
> people will forget to do it and there's no runtime or compile-time checking
> that they remembered to do it and when they forget to do it nobody will
> notice that it broke until ages and ages later.
>
> IOW: it is a sheer horror for maintainability.
>
>
> Please have a think about what we can do about this. For example, if you'd
> thought about this up-front, (and I think it's a big problem), we could
> have done something grotty like, in mnt_want_write():
>
> current->vfsmnt_im_allowed_to_write_to = inode;
>
> and then check that current->vfsmnt_im_allowed_to_write_to is the correct
> inode in __mark_inode_dirty() and various other strategic places. That
> sort of thing.
>
>
> We need to do *something*, I think. This code just doesn't look feasibly
> maintainable to me.
We definitely can't use 'current'-based counts for everything. For
example, we might take a write on a mnt at a time when a loopback device
is created. We'll release that write when the device is destroyed, and
these certainly won't always be the same task, or even process.
However, there are preciously few places in the code that actually have
to deal with these long-lived, persistent mount writer counts. Those
are basically where we're creating a 'struct file' and keeping it around
for a while.
The rest of the calls (95% or so) are much more atomic. They are much
more like a simple lock/unlock pair, and their references will never
persist beyond a single system call. These are also very rarely nested
and certainly never deeply nested.
For the "atomic" write counts, I keep track of the mnts on which a
particular task may write. This is a simple array in the task_struct.
I currently check this array on all task_struct free()s to make sure it
is empty. We could also do this at all system call exits, but this is
probably unnecessary because that would almost certainly be a
mnt_want_write() leak.
For the persistent users, we actually change the API to be
mnt_want_persistent_write(mnt, inode) to track the mnt and the inode
pair, and we hang the list of these off the superblock. We could
probably do this universally, but the number of persistent users is
quite small, which reduces the code churn required. This part of the
patch is not very scalable, so we'd have to rethink this if we want this
functionality for more than debugging.
Did you have this in mind as being a debugging option that we turn on
when we see problems like LOCKDEP, or were you thinking of something
that stays on all the time? This turned out to be more code than I had
hoped, so I'm very open to suggestions for other approaches.
We also need a thorough review to look for things like
ext3_mark_inode_dirty() since they never actually call down to the
generic vfs mark_inode_dirty().
This is strictly a RFC for now.
---
lxc-dave/fs/buffer.c | 10 ++
lxc-dave/fs/ext2/super.c | 2
lxc-dave/fs/ext3/balloc.c | 6 +
lxc-dave/fs/ext3/inode.c | 2
lxc-dave/fs/ext3/super.c | 9 ++
lxc-dave/fs/ext4/inode.c | 1
lxc-dave/fs/file_table.c | 4 -
lxc-dave/fs/fs-writeback.c | 27 ++++++
lxc-dave/fs/inode.c | 17 +++-
lxc-dave/fs/jbd/journal.c | 2
lxc-dave/fs/jbd/recovery.c | 2
lxc-dave/fs/jbd/transaction.c | 2
lxc-dave/fs/namei.c | 11 ++
lxc-dave/fs/namespace.c | 136 ++++++++++++++++++++++++++++++++++-
lxc-dave/fs/ocfs2/inode.c | 1
lxc-dave/fs/super.c | 2
lxc-dave/include/linux/buffer_head.h | 1
lxc-dave/include/linux/file.h | 1
lxc-dave/include/linux/fs.h | 2
lxc-dave/include/linux/mount.h | 17 ++++
lxc-dave/include/linux/sched.h | 2
lxc-dave/kernel/fork.c | 31 +++++++
22 files changed, 274 insertions(+), 14 deletions(-)
diff -puN fs/buffer.c~debug-missed-mnt_want_writes fs/buffer.c
--- lxc/fs/buffer.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/buffer.c 2007-09-25 15:27:43.000000000 -0700
@@ -1172,6 +1172,16 @@ __getblk_slow(struct block_device *bdev,
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
* mapping->tree_lock and the global inode_lock.
*/
+void fastcall mark_buffer_dirty_nocheck(struct buffer_head *bh)
+{
+ struct i_mnt_pair pair;
+ pair.inode = bh->b_page->mapping->host;
+ pair.mnt = NULL;
+ allow_writes_via_mnt_to(&pair);
+ mark_buffer_dirty(bh);
+ disallow_writes_via_mnt_to(pair.mnt, pair.inode, pair.inode->i_sb);
+}
+
void fastcall mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
diff -puN fs/ext2/super.c~debug-missed-mnt_want_writes fs/ext2/super.c
--- lxc/fs/ext2/super.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/ext2/super.c 2007-09-25 15:27:43.000000000 -0700
@@ -1372,7 +1372,7 @@ out:
#endif
-static struct file_system_type ext2_fs_type = {
+struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
.get_sb = ext2_get_sb,
diff -puN fs/ext3/balloc.c~debug-missed-mnt_want_writes fs/ext3/balloc.c
--- lxc/fs/ext3/balloc.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/ext3/balloc.c 2007-09-25 15:27:43.000000000 -0700
@@ -466,6 +466,12 @@ void ext3_free_blocks_sb(handle_t *handl
int err = 0, ret;
ext3_grpblk_t group_freed;
+ /*
+ * this is a workaround for warnings for now
+ * to do this properly, we need to take a
+ * mnt_want_write() when i_nlink goes to 0
+ * and drop it after iput_final();
+ */
*pdquot_freed_blocks = 0;
sbi = EXT3_SB(sb);
es = sbi->s_es;
diff -puN fs/ext3/inode.c~debug-missed-mnt_want_writes fs/ext3/inode.c
--- lxc/fs/ext3/inode.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/ext3/inode.c 2007-09-25 15:55:56.000000000 -0700
@@ -1228,7 +1228,6 @@ static int ext3_generic_write_end(struct
struct page *page, void *fsdata)
{
struct inode *inode = file->f_mapping->host;
-
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
if (pos+copied > inode->i_size) {
@@ -3173,6 +3172,7 @@ int ext3_mark_inode_dirty(handle_t *hand
int err;
might_sleep();
+ check_write_ability_to_inode(inode);
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (!err)
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
diff -puN fs/ext3/super.c~debug-missed-mnt_want_writes fs/ext3/super.c
--- lxc/fs/ext3/super.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/ext3/super.c 2007-09-25 15:27:43.000000000 -0700
@@ -1803,7 +1803,9 @@ static int ext3_fill_super (struct super
*/
if (!test_opt(sb, NOLOAD) &&
EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
- if (ext3_load_journal(sb, es, journal_devnum))
+ int jlret;
+ jlret = ext3_load_journal(sb, es, journal_devnum);
+ if (jlret)
goto failed_mount3;
} else if (journal_inum) {
if (ext3_create_journal(sb, es, journal_inum))
@@ -2218,6 +2220,11 @@ static void ext3_commit_super (struct su
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
+ /*
+ * during a mount operation, it should be OK to be writing
+ * to the superblock for things like dirty mount bits, even
+ * if no one else has permissions to write to it.
+ */
mark_buffer_dirty(sbh);
if (sync)
sync_dirty_buffer(sbh);
diff -puN fs/file_table.c~debug-missed-mnt_want_writes fs/file_table.c
--- lxc/fs/file_table.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/file_table.c 2007-09-25 15:27:43.000000000 -0700
@@ -194,7 +194,7 @@ int init_file(struct file *file, struct
file->f_mode = mode;
file->f_op = fop;
if (mode & FMODE_WRITE) {
- error = mnt_want_write(mnt);
+ error = mnt_want_persistent_write(mnt, dentry->d_inode);
WARN_ON(error);
}
return error;
@@ -237,7 +237,7 @@ void fastcall __fput(struct file *file)
if (file->f_mode & FMODE_WRITE) {
put_write_access(inode);
if (!special_file(inode->i_mode))
- mnt_drop_write(mnt);
+ mnt_drop_persistent_write(mnt, inode);
}
put_pid(file->f_owner.pid);
file_kill(file);
diff -puN fs/fs-writeback.c~debug-missed-mnt_want_writes fs/fs-writeback.c
--- lxc/fs/fs-writeback.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/fs-writeback.c 2007-09-25 16:05:17.000000000 -0700
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/writeback.h>
@@ -76,6 +77,31 @@ static void __check_dirty_inode_list(str
__FILE__, __LINE__); \
} while (0)
+void check_write_ability_to_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ int i;
+ /*
+ * You do not have to have write access to a mnt to write
+ * to a device file or something like a named pipe.
+ */
+ if (special_file(inode->i_mode))
+ return;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ struct vfsmount *mnt = current->can_write_via_mnt[i];
+ if (!mnt)
+ continue;
+ if (sb == mnt->mnt_sb)
+ break;
+ }
+ /*
+ * found a matching mount in can_write_via_mnt[]
+ */
+ if (i < CAN_WRITE_VIA_LEN)
+ return;
+ check_sb_for_writable_inode(sb, inode);
+}
+
/**
* __mark_inode_dirty - internal function
* @inode: inode to mark
@@ -106,6 +132,7 @@ static void __check_dirty_inode_list(str
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
+ check_write_ability_to_inode(inode);
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
diff -puN fs/inode.c~debug-missed-mnt_want_writes fs/inode.c
--- lxc/fs/inode.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/inode.c 2007-09-25 15:27:43.000000000 -0700
@@ -1137,15 +1137,28 @@ static inline void iput_final(struct ino
void iput(struct inode *inode)
{
if (inode) {
- const struct super_operations *op = inode->i_sb->s_op;
+ struct super_block *sb = inode->i_sb;
+ const struct super_operations *op = sb->s_op;
BUG_ON(inode->i_state == I_CLEAR);
if (op && op->put_inode)
op->put_inode(inode);
- if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) {
+ /*
+ * this is a workaround for warnings for now
+ * to do this properly, we need to take a
+ * mnt_want_write() when i_nlink goes to 0
+ * and drop it here, after iput_final().
+ */
+ struct i_mnt_pair pair;
+ pair.inode = inode;
+ pair.mnt = NULL;
+ allow_writes_via_mnt_to(&pair);
iput_final(inode);
+ disallow_writes_via_mnt_to(NULL, inode, sb);
+ }
}
}
diff -puN fs/jbd/journal.c~debug-missed-mnt_want_writes fs/jbd/journal.c
--- lxc/fs/jbd/journal.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/jbd/journal.c 2007-09-25 15:27:43.000000000 -0700
@@ -957,7 +957,7 @@ void journal_update_superblock(journal_t
spin_unlock(&journal->j_state_lock);
BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
+ mark_buffer_dirty_nocheck(bh);
if (wait)
sync_dirty_buffer(bh);
else
diff -puN fs/jbd/recovery.c~debug-missed-mnt_want_writes fs/jbd/recovery.c
--- lxc/fs/jbd/recovery.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/jbd/recovery.c 2007-09-25 15:27:43.000000000 -0700
@@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journa
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
+ mark_buffer_dirty_nocheck(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
diff -puN fs/jbd/transaction.c~debug-missed-mnt_want_writes fs/jbd/transaction.c
--- lxc/fs/jbd/transaction.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/jbd/transaction.c 2007-09-25 15:27:43.000000000 -0700
@@ -1548,7 +1548,7 @@ static void __journal_temp_unlink_buffer
__blist_del_buffer(list, jh);
jh->b_jlist = BJ_None;
if (test_clear_buffer_jbddirty(bh))
- mark_buffer_dirty(bh); /* Expose it to the VM */
+ mark_buffer_dirty_nocheck(bh); /* Expose it to the VM */
}
void __journal_unfile_buffer(struct journal_head *jh)
diff -puN fs/namei.c~debug-missed-mnt_want_writes fs/namei.c
--- lxc/fs/namei.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-25 16:24:44.000000000 -0700
@@ -1626,7 +1626,7 @@ int may_open(struct nameidata *nd, int a
* effectively: !special_file()
* balanced by __fput()
*/
- error = mnt_want_write(nd->mnt);
+ error = mnt_want_persistent_write(nd->mnt, inode);
if (error)
return error;
}
@@ -1667,8 +1667,15 @@ int may_open(struct nameidata *nd, int a
error = locks_verify_locked(inode);
if (!error) {
DQUOT_INIT(inode);
-
+ /*
+ * we should have already acquired a persistent write, but
+ * this acquires an atomic one for us (because the fd
+ * hasn't been installed yet
+ */
+ error = mnt_want_write(nd->mnt);
+ WARN_ON(error);
error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL);
+ mnt_drop_write(nd->mnt);
}
put_write_access(inode);
if (error)
diff -puN fs/namespace.c~debug-missed-mnt_want_writes fs/namespace.c
--- lxc/fs/namespace.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/namespace.c 2007-09-25 15:45:06.000000000 -0700
@@ -18,6 +18,7 @@
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
+#include <linux/file.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -168,6 +169,8 @@ static inline void use_cpu_writer_for_mo
cpu_writer->mnt = mnt;
}
+#define MNT_WRITE_PERSISTENT 1
+#define MNT_WRITE_ATOMIC 2
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
@@ -186,10 +189,64 @@ static inline void use_cpu_writer_for_mo
* the write operation is finished, mnt_drop_write()
* must be called. This is effectively a refcount.
*/
-int mnt_want_write(struct vfsmount *mnt)
+void check_sb_for_writable_inode(struct super_block *sb, struct inode *inode)
+{
+ int found = 0;
+ struct list_head *p;
+ spin_lock(&sb->s_writable_inodes_lock);
+ list_for_each(p, &sb->s_writable_inodes) {
+ struct i_mnt_pair *pair =
+ list_entry(p, struct i_mnt_pair, list);
+ if (pair->inode != inode)
+ continue;
+ found = 1;
+ }
+ spin_unlock(&sb->s_writable_inodes_lock);
+ WARN_ON(!found);
+}
+void allow_writes_via_mnt_to(struct i_mnt_pair *pair)
+{
+ struct super_block *sb = pair->inode->i_sb;
+ spin_lock(&sb->s_writable_inodes_lock);
+ list_add(&pair->list, &sb->s_writable_inodes);
+ spin_unlock(&sb->s_writable_inodes_lock);
+}
+/*
+ * unfortunately, we need to pass the sb in here as well.
+ * One of the places we need to do this is near iput_final(),
+ * which means that 'inode' may be freed. We can view the
+ * pointer to it, but not dereference to get i_sb.
+ */
+struct i_mnt_pair *disallow_writes_via_mnt_to(struct vfsmount *mnt,
+ struct inode *inode, struct super_block *sb)
+{
+ int found = 0;
+ struct list_head *p;
+ struct i_mnt_pair *pair = NULL;
+
+ spin_lock(&sb->s_writable_inodes_lock);
+ list_for_each(p, &sb->s_writable_inodes) {
+ pair = list_entry(p, struct i_mnt_pair, list);
+ if (pair->mnt != mnt || pair->inode != inode)
+ continue;
+ found = 1;
+ list_del(&pair->list);
+ break;
+ }
+ spin_unlock(&sb->s_writable_inodes_lock);
+ if (!found) {
+ WARN_ON(1);
+ return NULL;
+ }
+ return pair;
+}
+int __mnt_want_write(struct vfsmount *mnt, int atomic_or_peristent, struct inode *inode)
{
int ret = 0;
struct mnt_writer *cpu_writer;
+ struct i_mnt_pair *pair = kmalloc(sizeof(*pair), GFP_KERNEL);
+ if (!pair)
+ return -ENOMEM;
cpu_writer = &get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);
@@ -199,12 +256,51 @@ int mnt_want_write(struct vfsmount *mnt)
}
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
+ if (atomic_or_peristent == MNT_WRITE_ATOMIC) {
+ int i;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ if (i >= 1 && printk_ratelimit()) {
+ printk("%s() saw can_write_via_mnt[%d]: %p\n", __func__, i,
+ current->can_write_via_mnt[i]);
+ dump_stack();
+ }
+ if (current->can_write_via_mnt[i])
+ continue;
+ current->can_write_via_mnt[i] = mnt;
+ break;
+ }
+ WARN_ON(i == CAN_WRITE_VIA_LEN);
+ } else if (atomic_or_peristent == MNT_WRITE_PERSISTENT) {
+ struct super_block *sb = inode->i_sb;
+ pair->mnt = mnt;
+ pair->inode = inode;
+ spin_lock(&sb->s_writable_inodes_lock);
+ list_add(&pair->list, &sb->s_writable_inodes);
+ spin_unlock(&sb->s_writable_inodes_lock);
+ pair = NULL;
+ }
out:
+ if (pair)
+ kfree(pair);
spin_unlock(&cpu_writer->lock);
put_cpu_var(mnt_writers);
return ret;
}
+int mnt_want_write(struct vfsmount *mnt)
+{
+ int ret;
+ ret = __mnt_want_write(mnt, MNT_WRITE_ATOMIC, NULL);
+ if (ret)
+ return ret;
+ return ret;
+}
EXPORT_SYMBOL_GPL(mnt_want_write);
+int mnt_want_persistent_write(struct vfsmount *mnt, struct inode *inode)
+{
+ //printk("%s() mnt: %p '%s' %d\n", __func__, mnt, current->comm, current->pid);
+ return __mnt_want_write(mnt, MNT_WRITE_PERSISTENT, inode);
+}
+EXPORT_SYMBOL_GPL(mnt_want_persistent_write);
static void lock_and_coalesce_cpu_mnt_writer_counts(void)
{
@@ -248,7 +344,8 @@ static void handle_write_count_underflow
* performing writes to it. Must be matched with
* mnt_want_write() call above.
*/
-void mnt_drop_write(struct vfsmount *mnt)
+void __mnt_drop_write(struct vfsmount *mnt, int atomic_or_peristent,
+ struct inode *inode)
{
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;
@@ -263,6 +360,30 @@ void mnt_drop_write(struct vfsmount *mnt
must_check_underflow = 1;
atomic_dec(&mnt->__mnt_writers);
}
+ if (atomic_or_peristent == MNT_WRITE_ATOMIC) {
+ int i;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ if (current->can_write_via_mnt[i] != mnt)
+ continue;
+ current->can_write_via_mnt[i] = NULL;
+ break;
+ }
+ if (i >= CAN_WRITE_VIA_LEN) {
+ printk("i: %d len: %d mnt: %p\n", i, CAN_WRITE_VIA_LEN, mnt);
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ if (!current->can_write_via_mnt[i])
+ continue;
+ printk("current->can_write_via_mnt[%d]: %p\n", i,
+ current->can_write_via_mnt[i]);
+ }
+ WARN_ON(1);
+ }
+ } else if (atomic_or_peristent == MNT_WRITE_PERSISTENT) {
+ struct i_mnt_pair *pair = disallow_writes_via_mnt_to(mnt,
+ inode, inode->i_sb);
+ if (pair)
+ kfree(pair);
+ }
spin_unlock(&cpu_writer->lock);
/*
@@ -282,7 +403,18 @@ void mnt_drop_write(struct vfsmount *mnt
*/
put_cpu_var(mnt_writers);
}
+void mnt_drop_write(struct vfsmount *mnt)
+{
+ //printk("%s() mnt: %p '%s' %d\n", __func__, mnt, current->comm, current->pid);
+ __mnt_drop_write(mnt, MNT_WRITE_ATOMIC, NULL);
+}
EXPORT_SYMBOL_GPL(mnt_drop_write);
+void mnt_drop_persistent_write(struct vfsmount *mnt, struct inode *inode)
+{
+ //printk("%s() mnt: %p '%s' %d\n", __func__, mnt, current->comm, current->pid);
+ __mnt_drop_write(mnt, MNT_WRITE_PERSISTENT, inode);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_persistent_write);
static int mnt_make_readonly(struct vfsmount *mnt)
{
diff -puN fs/super.c~debug-missed-mnt_want_writes fs/super.c
--- lxc/fs/super.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/fs/super.c 2007-09-25 15:27:43.000000000 -0700
@@ -67,6 +67,8 @@ static struct super_block *alloc_super(s
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
+ INIT_LIST_HEAD(&s->s_writable_inodes);
+ spin_lock_init(&s->s_writable_inodes_lock);
INIT_LIST_HEAD(&s->s_inodes);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
diff -puN include/linux/buffer_head.h~debug-missed-mnt_want_writes include/linux/buffer_head.h
--- lxc/include/linux/buffer_head.h~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/include/linux/buffer_head.h 2007-09-25 15:27:43.000000000 -0700
@@ -145,6 +145,7 @@ BUFFER_FNS(Unwritten, unwritten)
*/
void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+void FASTCALL(mark_buffer_dirty_nocheck(struct buffer_head *bh));
void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
diff -puN include/linux/file.h~debug-missed-mnt_want_writes include/linux/file.h
--- lxc/include/linux/file.h~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/include/linux/file.h 2007-09-25 15:27:43.000000000 -0700
@@ -53,6 +53,7 @@ struct files_struct {
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
+ atomic_t persistent_mnt_writers;
};
#define files_fdtable(files) (rcu_dereference((files)->fdt))
diff -puN include/linux/fs.h~debug-missed-mnt_want_writes include/linux/fs.h
--- lxc/include/linux/fs.h~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-25 15:27:43.000000000 -0700
@@ -1041,6 +1041,8 @@ struct super_block {
* in /proc/mounts will be "type.subtype"
*/
char *s_subtype;
+ struct list_head s_writable_inodes;
+ spinlock_t s_writable_inodes_lock;
};
extern struct timespec current_fs_time(struct super_block *sb);
diff -puN include/linux/mount.h~debug-missed-mnt_want_writes include/linux/mount.h
--- lxc/include/linux/mount.h~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/include/linux/mount.h 2007-09-25 15:57:45.000000000 -0700
@@ -83,12 +83,29 @@ static inline struct vfsmount *mntget(st
return mnt;
}
+struct inode;
extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
+extern int mnt_want_persistent_write(struct vfsmount *mnt, struct inode *inode);
+extern void mnt_drop_persistent_write(struct vfsmount *mnt, struct inode *inode);
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
extern int __mnt_is_readonly(struct vfsmount *mnt);
+extern void check_write_ability_to_inode(struct inode *inode);
+extern void check_sb_for_writable_inode(struct super_block *sb,
+ struct inode *inode);
+
+
+struct i_mnt_pair {
+ struct list_head list;
+ struct inode *inode;
+ struct vfsmount *mnt;
+};
+extern void allow_writes_via_mnt_to(struct i_mnt_pair *pair);
+extern struct i_mnt_pair *disallow_writes_via_mnt_to(struct vfsmount *mnt,
+ struct inode *inode,
+ struct super_block *sb);
static inline void mntput(struct vfsmount *mnt)
{
diff -puN include/linux/sched.h~debug-missed-mnt_want_writes include/linux/sched.h
--- lxc/include/linux/sched.h~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/include/linux/sched.h 2007-09-25 15:27:43.000000000 -0700
@@ -1123,6 +1123,8 @@ struct task_struct {
int make_it_fail;
#endif
struct prop_local_single dirties;
+#define CAN_WRITE_VIA_LEN 5
+ struct vfsmount *can_write_via_mnt[CAN_WRITE_VIA_LEN];
};
/*
diff -puN kernel/fork.c~debug-missed-mnt_want_writes kernel/fork.c
--- lxc/kernel/fork.c~debug-missed-mnt_want_writes 2007-09-25 15:27:43.000000000 -0700
+++ lxc-dave/kernel/fork.c 2007-09-25 15:27:43.000000000 -0700
@@ -112,6 +112,20 @@ void free_task(struct task_struct *tsk)
prop_local_destroy_single(&tsk->dirties);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
+ {
+ int i;
+ int bad = 0;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ struct vfsmount *mnt = tsk->can_write_via_mnt[i];
+ if (!mnt)
+ continue;
+ printk("task '%s' exited with mnt[%d]:%p present\n",
+ tsk->comm, i, mnt);
+ bad = 1;
+ }
+ if (bad)
+ panic("bad can write stuff");
+ }
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -181,6 +195,12 @@ static struct task_struct *dup_task_stru
}
*tsk = *orig;
+ {
+ int i;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ tsk->can_write_via_mnt[i] = NULL;
+ }
+ }
tsk->stack = ti;
err = prop_local_init_single(&tsk->dirties);
@@ -203,6 +223,16 @@ static struct task_struct *dup_task_stru
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
+ {
+ int i;
+ for (i=0; i < CAN_WRITE_VIA_LEN; i++) {
+ struct vfsmount *mnt = tsk->can_write_via_mnt[i];
+ if (!mnt)
+ continue;
+ printk("task later %d mnt[%d]:%p present\n",
+ tsk->pid, i, mnt);
+ }
+ }
return tsk;
}
@@ -657,6 +687,7 @@ static struct files_struct *alloc_files(
goto out;
atomic_set(&newf->count, 1);
+ atomic_set(&newf->persistent_mnt_writers, 0);
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
diff -puN arch/i386/kernel/entry.S~debug-missed-mnt_want_writes arch/i386/kernel/entry.S
diff -puN arch/i386/kernel/irq.c~debug-missed-mnt_want_writes arch/i386/kernel/irq.c
diff -puN include/asm-i386/paravirt.h~debug-missed-mnt_want_writes include/asm-i386/paravirt.h
diff -puN include/asm-i386/irqflags.h~debug-missed-mnt_want_writes include/asm-i386/irqflags.h
diff -puN fs/ext4/inode.c~debug-missed-mnt_want_writes fs/ext4/inode.c
--- lxc/fs/ext4/inode.c~debug-missed-mnt_want_writes 2007-09-25 15:55:59.000000000 -0700
+++ lxc-dave/fs/ext4/inode.c 2007-09-25 15:56:03.000000000 -0700
@@ -3260,6 +3260,7 @@ int ext4_mark_inode_dirty(handle_t *hand
int err, ret;
might_sleep();
+ check_write_ability_to_inode(inode);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
diff -puN fs/ocfs2/inode.c~debug-missed-mnt_want_writes fs/ocfs2/inode.c
--- lxc/fs/ocfs2/inode.c~debug-missed-mnt_want_writes 2007-09-25 15:56:06.000000000 -0700
+++ lxc-dave/fs/ocfs2/inode.c 2007-09-25 15:56:10.000000000 -0700
@@ -1211,6 +1211,7 @@ int ocfs2_mark_inode_dirty(handle_t *han
int status;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+ check_write_ability_to_inode(inode);
mlog_entry("(inode %llu)\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
diff -puN arch/i386/kernel/syscall_table.S~debug-missed-mnt_want_writes arch/i386/kernel/syscall_table.S
diff -puN fs/nfsctl.c~debug-missed-mnt_want_writes fs/nfsctl.c
_
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-20 19:52 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Dave Hansen
2007-09-21 8:17 ` Andrew Morton
@ 2007-09-21 23:03 ` Andrew Morton
2007-09-21 23:39 ` Dave Hansen
1 sibling, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-09-21 23:03 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Thu, 20 Sep 2007 12:52:57 -0700
Dave Hansen <haveblue@us.ibm.com> wrote:
> Some ioctl()s can cause writes to the filesystem. Take
> these, and make them use mnt_want/drop_write() instead.
>
> We need to pass the filp one layer deeper in XFS, but
> somebody _just_ pulled it out in February because nobody
> was using it, so I don't feel guilty for adding it back.
Note that -mm's ext2-reservations.patch adds EXT2_IOC_SETRSVSZ,
and it doesn't do mnt_want_write().
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-21 23:03 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Andrew Morton
@ 2007-09-21 23:39 ` Dave Hansen
2007-09-21 23:47 ` Andrew Morton
0 siblings, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-21 23:39 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch
On Fri, 2007-09-21 at 16:03 -0700, Andrew Morton wrote:
> Dave Hansen <haveblue@us.ibm.com> wrote:
>
> > Some ioctl()s can cause writes to the filesystem. Take
> > these, and make them use mnt_want/drop_write() instead.
> >
> > We need to pass the filp one layer deeper in XFS, but
> > somebody _just_ pulled it out in February because nobody
> > was using it, so I don't feel guilty for adding it back.
>
> Note that -mm's ext2-reservations.patch adds EXT2_IOC_SETRSVSZ,
> and it doesn't do mnt_want_write().
That doesn't quite apply to mainline (at least after the patches I just
sent). I'll wait and send you one on top of the next -mm so that I can
get a coherent view of what's going on if that's all right.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls
2007-09-21 23:39 ` Dave Hansen
@ 2007-09-21 23:47 ` Andrew Morton
0 siblings, 0 replies; 48+ messages in thread
From: Andrew Morton @ 2007-09-21 23:47 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Fri, 21 Sep 2007 16:39:40 -0700
Dave Hansen <haveblue@us.ibm.com> wrote:
> On Fri, 2007-09-21 at 16:03 -0700, Andrew Morton wrote:
> > Dave Hansen <haveblue@us.ibm.com> wrote:
> >
> > > Some ioctl()s can cause writes to the filesystem. Take
> > > these, and make them use mnt_want/drop_write() instead.
> > >
> > > We need to pass the filp one layer deeper in XFS, but
> > > somebody _just_ pulled it out in February because nobody
> > > was using it, so I don't feel guilty for adding it back.
> >
> > Note that -mm's ext2-reservations.patch adds EXT2_IOC_SETRSVSZ,
> > and it doesn't do mnt_want_write().
>
> That doesn't quite apply to mainline (at least after the patches I just
> sent). I'll wait and send you one on top of the next -mm so that I can
> get a coherent view of what's going on if that's all right.
>
Sure, that's OK.
But I only noticed it because I happened to have my nose in there fixing a
reject.
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH 08/25] elevate writer count for chown and friends
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (6 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 07/25] r/o bind mounts: elevate write count for some ioctls Dave Hansen
@ 2007-09-20 19:52 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 09/25] make access() use mnt check Dave Hansen
` (16 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
chown/chmod,etc... don't call permission in the same way
that the normal "open for write" calls do. They still
write to the filesystem, so bump the write count during
these operations.
This conflicts with the current (~2.6.23-rc7) audit
git tree in -mm. wiggle'ing the patch merges it.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/open.c | 39 ++++++++++++++++++++++++++++++---------
1 file changed, 30 insertions(+), 9 deletions(-)
diff -puN fs/open.c~elevate-writer-count-for-chown-and-friends fs/open.c
--- lxc/fs/open.c~elevate-writer-count-for-chown-and-friends 2007-09-20 12:16:13.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-20 12:16:13.000000000 -0700
@@ -571,12 +571,12 @@ asmlinkage long sys_fchmod(unsigned int
audit_inode(NULL, inode);
- err = -EROFS;
- if (IS_RDONLY(inode))
+ err = mnt_want_write(file->f_vfsmnt);
+ if (err)
goto out_putf;
err = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_putf;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
@@ -585,6 +585,8 @@ asmlinkage long sys_fchmod(unsigned int
err = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(file->f_vfsmnt);
out_putf:
fput(file);
out:
@@ -604,13 +606,13 @@ asmlinkage long sys_fchmodat(int dfd, co
goto out;
inode = nd.dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(nd.mnt);
+ if (error)
goto dput_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
@@ -620,6 +622,8 @@ asmlinkage long sys_fchmodat(int dfd, co
error = notify_change(nd.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(nd.mnt);
dput_and_out:
path_release(&nd);
out:
@@ -642,9 +646,6 @@ static int chown_common(struct dentry *
printk(KERN_ERR "chown_common: NULL inode\n");
goto out;
}
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
goto out;
@@ -674,7 +675,12 @@ asmlinkage long sys_chown(const char __u
error = user_path_walk(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+ mnt_drop_write(nd.mnt);
+out_release:
path_release(&nd);
out:
return error;
@@ -694,7 +700,12 @@ asmlinkage long sys_fchownat(int dfd, co
error = __user_walk_fd(dfd, filename, follow, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+ mnt_drop_write(nd.mnt);
+out_release:
path_release(&nd);
out:
return error;
@@ -708,7 +719,12 @@ asmlinkage long sys_lchown(const char __
error = user_path_walk_link(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.dentry, user, group);
+ mnt_drop_write(nd.mnt);
+out_release:
path_release(&nd);
out:
return error;
@@ -725,9 +741,14 @@ asmlinkage long sys_fchown(unsigned int
if (!file)
goto out;
+ error = mnt_want_write(file->f_vfsmnt);
+ if (error)
+ goto out_fput;
dentry = file->f_path.dentry;
audit_inode(NULL, dentry->d_inode);
error = chown_common(dentry, user, group);
+ mnt_drop_write(file->f_vfsmnt);
+out_fput:
fput(file);
out:
return error;
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 09/25] make access() use mnt check
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (7 preceding siblings ...)
2007-09-20 19:52 ` [PATCH 08/25] elevate writer count for chown and friends Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 10/25] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
` (15 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
It is OK to let access() go without using a mnt_want/drop_write()
pair because it doesn't actually do writes to the filesystem,
and it is inherently racy anyway. This is a rare case when it is
OK to use __mnt_is_readonly() directly.
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/open.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff -puN fs/open.c~make-access-use-helper fs/open.c
--- lxc/fs/open.c~make-access-use-helper 2007-09-20 12:16:13.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-20 12:16:13.000000000 -0700
@@ -457,8 +457,17 @@ asmlinkage long sys_faccessat(int dfd, c
if(res || !(mode & S_IWOTH) ||
special_file(nd.dentry->d_inode->i_mode))
goto out_path_release;
-
- if(IS_RDONLY(nd.dentry->d_inode))
+ /*
+ * This is a rare case where using __mnt_is_readonly()
+ * is OK without a mnt_want/drop_write() pair. Since
+ * no actual write to the fs is performed here, we do
+ * not need to telegraph to that to anyone.
+ *
+ * By doing this, we accept that this access is
+ * inherently racy and know that the fs may change
+ * state before we even see this result.
+ */
+ if (__mnt_is_readonly(nd.mnt))
res = -EROFS;
out_path_release:
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 10/25] elevate mnt writers for callers of vfs_mkdir()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (8 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 09/25] make access() use mnt check Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 11/25] elevate write count during entire ncp_ioctl() Dave Hansen
` (14 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Pretty self-explanatory. Fits in with the rest of the series.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/namei.c | 5 +++++
lxc-dave/fs/nfsd/nfs4recover.c | 4 ++++
2 files changed, 9 insertions(+)
diff -puN fs/namei.c~elevate-mnt-writers-for-callers-of-vfs-mkdir fs/namei.c
--- lxc/fs/namei.c~elevate-mnt-writers-for-callers-of-vfs-mkdir 2007-09-20 12:16:14.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:14.000000000 -0700
@@ -2026,7 +2026,12 @@ asmlinkage long sys_mkdirat(int dfd, con
if (!IS_POSIXACL(nd.dentry->d_inode))
mode &= ~current->fs->umask;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
diff -puN fs/nfsd/nfs4recover.c~elevate-mnt-writers-for-callers-of-vfs-mkdir fs/nfsd/nfs4recover.c
--- lxc/fs/nfsd/nfs4recover.c~elevate-mnt-writers-for-callers-of-vfs-mkdir 2007-09-20 12:16:14.000000000 -0700
+++ lxc-dave/fs/nfsd/nfs4recover.c 2007-09-20 12:16:14.000000000 -0700
@@ -156,7 +156,11 @@ nfsd4_create_clid_dir(struct nfs4_client
dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
goto out_put;
}
+ status = mnt_want_write(rec_dir.mnt);
+ if (status)
+ goto out_put;
status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+ mnt_drop_write(rec_dir.mnt);
out_put:
dput(dentry);
out_unlock:
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 11/25] elevate write count during entire ncp_ioctl()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (9 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 10/25] elevate mnt writers for callers of vfs_mkdir() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 12/25] elevate write count for link and symlink calls Dave Hansen
` (13 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Some ioctls need write access, but others don't. Make a helper
function to decide when write access is needed, and take it.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/ncpfs/ioctl.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
diff -puN fs/ncpfs/ioctl.c~elevate-write-count-during-entire-ncp-ioctl fs/ncpfs/ioctl.c
--- lxc/fs/ncpfs/ioctl.c~elevate-write-count-during-entire-ncp-ioctl 2007-09-20 12:16:15.000000000 -0700
+++ lxc-dave/fs/ncpfs/ioctl.c 2007-09-20 12:16:15.000000000 -0700
@@ -14,6 +14,7 @@
#include <linux/ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/highuid.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* serv
}
#endif /* CONFIG_NCPFS_NLS */
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
return -EINVAL;
}
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+ switch (cmd) {
+ case NCP_IOC_GET_FS_INFO:
+ case NCP_IOC_GET_FS_INFO_V2:
+ case NCP_IOC_NCPREQUEST:
+ case NCP_IOC_SETDENTRYTTL:
+ case NCP_IOC_SIGN_INIT:
+ case NCP_IOC_LOCKUNLOCK:
+ case NCP_IOC_SET_SIGN_WANTED:
+ return 1;
+ case NCP_IOC_GETOBJECTNAME:
+ case NCP_IOC_SETOBJECTNAME:
+ case NCP_IOC_GETPRIVATEDATA:
+ case NCP_IOC_SETPRIVATEDATA:
+ case NCP_IOC_SETCHARSETS:
+ case NCP_IOC_GETCHARSETS:
+ case NCP_IOC_CONN_LOGGED_IN:
+ case NCP_IOC_GETDENTRYTTL:
+ case NCP_IOC_GETMOUNTUID2:
+ case NCP_IOC_SIGN_WANTED:
+ case NCP_IOC_GETROOT:
+ case NCP_IOC_SETROOT:
+ return 0;
+ default:
+ /* unkown IOCTL command, assume write */
+ }
+ return 1;
+}
+
+int ncp_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ if (ncp_ioctl_need_write(cmd)) {
+ /*
+ * inside the ioctl(), any failures which
+ * are because of file_permission() are
+ * -EACCESS, so it seems consistent to keep
+ * that here.
+ */
+ if (mnt_want_write(filp->f_vfsmnt))
+ return -EACCES;
+ }
+ ret = __ncp_ioctl(inode, filp, cmd, arg);
+ if (ncp_ioctl_need_write(cmd))
+ mnt_drop_write(filp->f_vfsmnt);
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 12/25] elevate write count for link and symlink calls
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (10 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 11/25] elevate write count during entire ncp_ioctl() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 13/25] elevate mount count for extended attributes Dave Hansen
` (12 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff -puN fs/namei.c~elevate-write-count-for-link-and-symlink-calls fs/namei.c
--- lxc/fs/namei.c~elevate-write-count-for-link-and-symlink-calls 2007-09-20 12:16:15.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:15.000000000 -0700
@@ -2299,7 +2299,12 @@ asmlinkage long sys_symlinkat(const char
if (IS_ERR(dentry))
goto out_unlock;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
@@ -2394,7 +2399,12 @@ asmlinkage long sys_linkat(int olddfd, c
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out_unlock;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+ mnt_drop_write(nd.mnt);
+out_dput:
dput(new_dentry);
out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 13/25] elevate mount count for extended attributes
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (11 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 12/25] elevate write count for link and symlink calls Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 14/25] elevate write count for file_update_time() Dave Hansen
` (11 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
This basically audits the callers of xattr_permission(), which
calls permission() and can perform writes to the filesystem.
This conflicts with the current (~2.6.23-rc7) audit
git tree in -mm. wiggle'ing the patch merges it.
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/nfsd/nfs4proc.c | 7 ++++++-
lxc-dave/fs/xattr.c | 16 ++++++++++++++--
2 files changed, 20 insertions(+), 3 deletions(-)
diff -puN fs/nfsd/nfs4proc.c~elevate-mount-count-for-extended-attributes fs/nfsd/nfs4proc.c
--- lxc/fs/nfsd/nfs4proc.c~elevate-mount-count-for-extended-attributes 2007-09-20 12:16:16.000000000 -0700
+++ lxc-dave/fs/nfsd/nfs4proc.c 2007-09-20 12:16:16.000000000 -0700
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
return status;
}
}
+ status = mnt_want_write(cstate->current_fh.fh_export->ex_mnt);
+ if (status)
+ return status;
status = nfs_ok;
if (setattr->sa_acl != NULL)
status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
setattr->sa_acl);
if (status)
- return status;
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
+out:
+ mnt_drop_write(cstate->current_fh.fh_export->ex_mnt);
return status;
}
diff -puN fs/xattr.c~elevate-mount-count-for-extended-attributes fs/xattr.c
--- lxc/fs/xattr.c~elevate-mount-count-for-extended-attributes 2007-09-20 12:16:16.000000000 -0700
+++ lxc-dave/fs/xattr.c 2007-09-20 12:16:16.000000000 -0700
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, co
* filesystem or on an immutable / append-only inode.
*/
if (mask & MAY_WRITE) {
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
}
@@ -235,7 +234,11 @@ sys_setxattr(char __user *path, char __u
error = user_path_walk(path, &nd);
if (error)
return error;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ return error;
error = setxattr(nd.dentry, name, value, size, flags);
+ mnt_drop_write(nd.mnt);
path_release(&nd);
return error;
}
@@ -250,7 +253,11 @@ sys_lsetxattr(char __user *path, char __
error = user_path_walk_link(path, &nd);
if (error)
return error;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ return error;
error = setxattr(nd.dentry, name, value, size, flags);
+ mnt_drop_write(nd.mnt);
path_release(&nd);
return error;
}
@@ -266,9 +273,14 @@ sys_fsetxattr(int fd, char __user *name,
f = fget(fd);
if (!f)
return error;
+ error = mnt_want_write(f->f_vfsmnt);
+ if (error)
+ goto out_fput;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry->d_inode);
error = setxattr(dentry, name, value, size, flags);
+ mnt_drop_write(f->f_vfsmnt);
+out_fput:
fput(f);
return error;
}
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 14/25] elevate write count for file_update_time()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (12 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 13/25] elevate mount count for extended attributes Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 15/25] unix_find_other() elevate write count for touch_atime() Dave Hansen
` (10 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/inode.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff -puN fs/inode.c~elevate-write-count-for-file_update_time fs/inode.c
--- lxc/fs/inode.c~elevate-write-count-for-file_update_time 2007-09-20 12:16:16.000000000 -0700
+++ lxc-dave/fs/inode.c 2007-09-20 12:16:16.000000000 -0700
@@ -1232,10 +1232,19 @@ void file_update_time(struct file *file)
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
int sync_it = 0;
+ int err = 0;
if (IS_NOCMTIME(inode))
return;
- if (IS_RDONLY(inode))
+ /*
+ * Ideally, we want to guarantee that 'f_vfsmnt'
+ * is non-NULL here. But, NFS exports need to
+ * be fixed up before we can do that. So, check
+ * it for now. - Dave Hansen
+ */
+ if (file->f_vfsmnt)
+ err = mnt_want_write(file->f_vfsmnt);
+ if (err)
return;
now = current_fs_time(inode->i_sb);
@@ -1251,6 +1260,8 @@ void file_update_time(struct file *file)
if (sync_it)
mark_inode_dirty_sync(inode);
+ if (file->f_vfsmnt)
+ mnt_drop_write(file->f_vfsmnt);
}
EXPORT_SYMBOL(file_update_time);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 15/25] unix_find_other() elevate write count for touch_atime()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (13 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 14/25] elevate write count for file_update_time() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 16/25] elevate write count over calls to vfs_rename() Dave Hansen
` (9 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/net/unix/af_unix.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff -puN net/unix/af_unix.c~unix-find-other-elevate-write-count-for-touch-atime net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~unix-find-other-elevate-write-count-for-touch-atime 2007-09-20 12:16:17.000000000 -0700
+++ lxc-dave/net/unix/af_unix.c 2007-09-20 12:16:17.000000000 -0700
@@ -729,21 +729,27 @@ static struct sock *unix_find_other(stru
err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
if (err)
goto fail;
+
+ err = mnt_want_write(nd.mnt);
+ if (err)
+ goto put_path_fail;
+
err = vfs_permission(&nd, MAY_WRITE);
if (err)
- goto put_fail;
+ goto mnt_drop_write_fail;
err = -ECONNREFUSED;
if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
- goto put_fail;
+ goto mnt_drop_write_fail;
u=unix_find_socket_byinode(nd.dentry->d_inode);
if (!u)
- goto put_fail;
+ goto mnt_drop_write_fail;
if (u->sk_type == type)
touch_atime(nd.mnt, nd.dentry);
path_release(&nd);
+ mnt_drop_write(nd.mnt);
err=-EPROTOTYPE;
if (u->sk_type != type) {
@@ -763,7 +769,9 @@ static struct sock *unix_find_other(stru
}
return u;
-put_fail:
+mnt_drop_write_fail:
+ mnt_drop_write(nd.mnt);
+put_path_fail:
path_release(&nd);
fail:
*error=err;
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 16/25] elevate write count over calls to vfs_rename()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (14 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 15/25] unix_find_other() elevate write count for touch_atime() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 17/25] nfs: check mnt instead of superblock directly Dave Hansen
` (8 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
This also uses the little helper in the NFS code to
make an if() a little bit less ugly. We introduced
the helper at the beginning of the series.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/namei.c | 4 ++++
lxc-dave/fs/nfsd/vfs.c | 15 +++++++++++----
2 files changed, 15 insertions(+), 4 deletions(-)
diff -puN fs/namei.c~elevate-write-count-over-calls-to-vfs-rename fs/namei.c
--- lxc/fs/namei.c~elevate-write-count-over-calls-to-vfs-rename 2007-09-20 12:16:18.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:18.000000000 -0700
@@ -2630,8 +2630,12 @@ static int do_rename(int olddfd, const c
if (new_dentry == trap)
goto exit5;
+ error = mnt_want_write(oldnd.mnt);
+ if (error)
+ goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
+ mnt_drop_write(oldnd.mnt);
exit5:
dput(new_dentry);
exit4:
diff -puN fs/nfsd/vfs.c~elevate-write-count-over-calls-to-vfs-rename fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~elevate-write-count-over-calls-to-vfs-rename 2007-09-20 12:16:18.000000000 -0700
+++ lxc-dave/fs/nfsd/vfs.c 2007-09-20 12:16:18.000000000 -0700
@@ -1651,13 +1651,20 @@ nfsd_rename(struct svc_rqst *rqstp, stru
if (ndentry == trap)
goto out_dput_new;
-#ifdef MSNFS
- if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ if (svc_msnfs(ffhp) &&
((atomic_read(&odentry->d_count) > 1)
|| (atomic_read(&ndentry->d_count) > 1))) {
host_err = -EPERM;
- } else
-#endif
+ goto out_dput_new;
+ }
+
+ host_err = -EXDEV;
+ if (ffhp->fh_export->ex_mnt != tfhp->fh_export->ex_mnt)
+ goto out_dput_new;
+ host_err = mnt_want_write(ffhp->fh_export->ex_mnt);
+ if (host_err)
+ goto out_dput_new;
+
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
host_err = nfsd_sync_dir(tdentry);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 17/25] nfs: check mnt instead of superblock directly
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (15 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 16/25] elevate write count over calls to vfs_rename() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 18/25] elevate writer count for do_sys_truncate() Dave Hansen
` (7 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
If we depend on the inodes for writeability, we will not
catch the r/o mounts when implemented.
This patches uses __mnt_want_write(). It does not guarantee
that the mount will stay writeable after the check. But,
this is OK for one of the checks because it is just for a
printk().
The other two are probably unnecessary and duplicate existing
checks in the VFS. This won't make them better checks than
before, but it will make them detect r/o mounts.
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/nfs/dir.c | 3 ++-
lxc-dave/fs/nfsd/vfs.c | 4 ++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff -puN fs/nfs/dir.c~nfs-check-mnt-instead-of-sb fs/nfs/dir.c
--- lxc/fs/nfs/dir.c~nfs-check-mnt-instead-of-sb 2007-09-20 12:16:18.000000000 -0700
+++ lxc-dave/fs/nfs/dir.c 2007-09-20 12:16:18.000000000 -0700
@@ -997,7 +997,8 @@ static int is_atomic_open(struct inode *
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
/* Are we trying to write to a read only partition? */
- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+ if (__mnt_is_readonly(nd->mnt) &&
+ (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
return 0;
return 1;
}
diff -puN fs/nfsd/vfs.c~nfs-check-mnt-instead-of-sb fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~nfs-check-mnt-instead-of-sb 2007-09-20 12:16:18.000000000 -0700
+++ lxc-dave/fs/nfsd/vfs.c 2007-09-20 12:16:18.000000000 -0700
@@ -1845,7 +1845,7 @@ nfsd_permission(struct svc_rqst *rqstp,
inode->i_mode,
IS_IMMUTABLE(inode)? " immut" : "",
IS_APPEND(inode)? " append" : "",
- IS_RDONLY(inode)? " ro" : "");
+ __mnt_is_readonly(exp->ex_mnt)? " ro" : "");
dprintk(" owner %d/%d user %d/%d\n",
inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
#endif
@@ -1856,7 +1856,7 @@ nfsd_permission(struct svc_rqst *rqstp,
*/
if (!(acc & MAY_LOCAL_ACCESS))
if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
- if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
+ if (exp_rdonly(rqstp, exp) || __mnt_is_readonly(exp->ex_mnt))
return nfserr_rofs;
if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
return nfserr_perm;
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 18/25] elevate writer count for do_sys_truncate()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (16 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 17/25] nfs: check mnt instead of superblock directly Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 19/25] elevate write count for do_utimes() Dave Hansen
` (6 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/open.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff -puN fs/open.c~elevate-writer-count-for-do-sys-truncate fs/open.c
--- lxc/fs/open.c~elevate-writer-count-for-do-sys-truncate 2007-09-20 12:16:19.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-20 12:16:19.000000000 -0700
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char _
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = vfs_permission(&nd, MAY_WRITE);
+ error = mnt_want_write(nd.mnt);
if (error)
goto dput_and_out;
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
+ error = vfs_permission(&nd, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = get_write_access(inode);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
/*
* Make sure that there are no leases. get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char _
put_write_and_out:
put_write_access(inode);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.mnt);
dput_and_out:
path_release(&nd);
out:
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 19/25] elevate write count for do_utimes()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (17 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 18/25] elevate writer count for do_sys_truncate() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 20/25] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
` (5 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/utimes.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff -puN fs/utimes.c~elevate-write-count-for-do-utimes fs/utimes.c
--- lxc/fs/utimes.c~elevate-write-count-for-do-utimes 2007-09-20 12:16:20.000000000 -0700
+++ lxc-dave/fs/utimes.c 2007-09-20 12:16:20.000000000 -0700
@@ -2,6 +2,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/linkage.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/stat.h>
@@ -75,8 +76,8 @@ long do_utimes(int dfd, char __user *fil
inode = dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(nd.mnt);
+ if (error)
goto dput_and_out;
/* Don't worry, the checks are done in inode_change_ok() */
@@ -84,7 +85,7 @@ long do_utimes(int dfd, char __user *fil
if (times) {
error = -EPERM;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (times[0].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_ATIME;
@@ -104,22 +105,24 @@ long do_utimes(int dfd, char __user *fil
} else {
error = -EACCES;
if (IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
if (f) {
if (!(f->f_mode & FMODE_WRITE))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
} else {
error = vfs_permission(&nd, MAY_WRITE);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
}
}
}
mutex_lock(&inode->i_mutex);
error = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.mnt);
dput_and_out:
if (f)
fput(f);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 20/25] elevate write count for do_sys_utime() and touch_atime()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (18 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 19/25] elevate write count for do_utimes() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 21/25] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
` (4 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/inode.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff -puN fs/inode.c~elevate-write-count-for-do-sys-utime-and-touch-atime fs/inode.c
--- lxc/fs/inode.c~elevate-write-count-for-do-sys-utime-and-touch-atime 2007-09-20 12:16:20.000000000 -0700
+++ lxc-dave/fs/inode.c 2007-09-20 12:16:20.000000000 -0700
@@ -1176,22 +1176,23 @@ void touch_atime(struct vfsmount *mnt, s
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (inode->i_flags & S_NOATIME)
+ if (mnt && mnt_want_write(mnt))
return;
+ if (inode->i_flags & S_NOATIME)
+ goto out;
if (IS_NOATIME(inode))
- return;
+ goto out;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ goto out;
/*
* We may have a NULL vfsmount when coming from NFSD
*/
if (mnt) {
if (mnt->mnt_flags & MNT_NOATIME)
- return;
+ goto out;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
-
+ goto out;
if (mnt->mnt_flags & MNT_RELATIME) {
/*
* With relative atime, only update atime if the
@@ -1202,16 +1203,19 @@ void touch_atime(struct vfsmount *mnt, s
&inode->i_atime) < 0 &&
timespec_compare(&inode->i_ctime,
&inode->i_atime) < 0)
- return;
+ goto out;
}
}
now = current_fs_time(inode->i_sb);
if (timespec_equal(&inode->i_atime, &now))
- return;
+ goto out;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
+out:
+ if (mnt)
+ mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 21/25] sys_mknodat(): elevate write count for vfs_mknod/create()
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (19 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 20/25] elevate write count for do_sys_utime() and touch_atime() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 22/25] elevate mnt writers for vfs_unlink() callers Dave Hansen
` (3 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
This takes care of all of the direct callers of vfs_mknod().
Since a few of these cases also handle normal file creation
as well, this also covers some calls to vfs_create().
So that we don't have to make three mnt_want/drop_write()
calls inside of the switch statement, we move some of its
logic outside of the switch.
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 32 +++++++++++++++++++++-----------
lxc-dave/fs/nfsd/vfs.c | 4 ++++
lxc-dave/net/unix/af_unix.c | 4 ++++
3 files changed, 29 insertions(+), 11 deletions(-)
diff -puN fs/namei.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/namei.c
--- lxc/fs/namei.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-09-20 12:16:21.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:21.000000000 -0700
@@ -1945,12 +1945,25 @@ asmlinkage long sys_mknodat(int dfd, con
if (error)
goto out;
dentry = lookup_create(&nd, 0);
- error = PTR_ERR(dentry);
-
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
if (!IS_POSIXACL(nd.dentry->d_inode))
mode &= ~current->fs->umask;
- if (!IS_ERR(dentry)) {
- switch (mode & S_IFMT) {
+ if (S_ISDIR(mode)) {
+ error = -EPERM;
+ goto out_dput;
+ }
+ if (!S_ISREG(mode) && !S_ISCHR(mode) && !S_ISBLK(mode) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode) && mode != 0) {
+ error = -EINVAL;
+ goto out_dput;
+ }
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto out_dput;
+ switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
break;
@@ -1961,14 +1974,11 @@ asmlinkage long sys_mknodat(int dfd, con
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(nd.dentry->d_inode,dentry,mode,0);
break;
- case S_IFDIR:
- error = -EPERM;
- break;
- default:
- error = -EINVAL;
- }
- dput(dentry);
}
+ mnt_drop_write(nd.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
path_release(&nd);
out:
diff -puN fs/nfsd/vfs.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create fs/nfsd/vfs.c
--- lxc/fs/nfsd/vfs.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-09-20 12:16:21.000000000 -0700
+++ lxc-dave/fs/nfsd/vfs.c 2007-09-20 12:16:21.000000000 -0700
@@ -1225,7 +1225,11 @@ nfsd_create(struct svc_rqst *rqstp, stru
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
+ host_err = mnt_want_write(fhp->fh_export->ex_mnt);
+ if (host_err)
+ break;
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+ mnt_drop_write(fhp->fh_export->ex_mnt);
break;
default:
printk("nfsd: bad file type %o in nfsd_create\n", type);
diff -puN net/unix/af_unix.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create net/unix/af_unix.c
--- lxc/net/unix/af_unix.c~sys-mknodat-elevate-write-count-for-vfs-mknod-create 2007-09-20 12:16:21.000000000 -0700
+++ lxc-dave/net/unix/af_unix.c 2007-09-20 12:16:21.000000000 -0700
@@ -842,7 +842,11 @@ static int unix_bind(struct socket *sock
*/
mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current->fs->umask);
+ err = mnt_want_write(nd.mnt);
+ if (err)
+ goto out_mknod_dput;
err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
+ mnt_drop_write(nd.mnt);
if (err)
goto out_mknod_dput;
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 22/25] elevate mnt writers for vfs_unlink() callers
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (20 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 21/25] sys_mknodat(): elevate write count for vfs_mknod/create() Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 23/25] do_rmdir(): elevate write count Dave Hansen
` (2 subsequent siblings)
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 4 ++++
lxc-dave/ipc/mqueue.c | 5 ++++-
2 files changed, 8 insertions(+), 1 deletion(-)
diff -puN fs/namei.c~elevate-mnt-writers-for-vfs-unlink-callers fs/namei.c
--- lxc/fs/namei.c~elevate-mnt-writers-for-vfs-unlink-callers 2007-09-20 12:16:21.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:21.000000000 -0700
@@ -2228,7 +2228,11 @@ static long do_unlinkat(int dfd, const c
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto exit2;
error = vfs_unlink(nd.dentry->d_inode, dentry);
+ mnt_drop_write(nd.mnt);
exit2:
dput(dentry);
}
diff -puN ipc/mqueue.c~elevate-mnt-writers-for-vfs-unlink-callers ipc/mqueue.c
--- lxc/ipc/mqueue.c~elevate-mnt-writers-for-vfs-unlink-callers 2007-09-20 12:16:21.000000000 -0700
+++ lxc-dave/ipc/mqueue.c 2007-09-20 12:16:21.000000000 -0700
@@ -750,8 +750,11 @@ asmlinkage long sys_mq_unlink(const char
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
-
+ err = mnt_want_write(mqueue_mnt);
+ if (err)
+ goto out_err;
err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+ mnt_drop_write(mqueue_mnt);
out_err:
dput(dentry);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 23/25] do_rmdir(): elevate write count
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (21 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 22/25] elevate mnt writers for vfs_unlink() callers Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-20 19:53 ` [PATCH 24/25] r/o bind mounts: track number of mount writers Dave Hansen
2007-09-20 19:53 ` [PATCH 25/25] honor r/w changes at do_remount() time Dave Hansen
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Elevate the write count during the vfs_rmdir() call.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
---
lxc-dave/fs/namei.c | 5 +++++
1 file changed, 5 insertions(+)
diff -puN fs/namei.c~do-rmdir-elevate-write-count fs/namei.c
--- lxc/fs/namei.c~do-rmdir-elevate-write-count 2007-09-20 12:16:22.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-20 12:16:22.000000000 -0700
@@ -2148,7 +2148,12 @@ static long do_rmdir(int dfd, const char
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
+ error = mnt_want_write(nd.mnt);
+ if (error)
+ goto exit3;
error = vfs_rmdir(nd.dentry->d_inode, dentry);
+ mnt_drop_write(nd.mnt);
+exit3:
dput(dentry);
exit2:
mutex_unlock(&nd.dentry->d_inode->i_mutex);
_
^ permalink raw reply [flat|nested] 48+ messages in thread* [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (22 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 23/25] do_rmdir(): elevate write count Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
2007-09-24 6:17 ` Andrew Morton
2007-09-24 17:54 ` Christoph Hellwig
2007-09-20 19:53 ` [PATCH 25/25] honor r/w changes at do_remount() time Dave Hansen
24 siblings, 2 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
If we can't pull just this patch in for now, it would be
great to get everything leading up to here pulled in. I've
re-implemented this several ways, and it has never caused
the preceeding patches to change at all.
--
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can
be hundreds of cpus doing open()/close() on files on the
same mnt at the same time. Even an atomic_t in the mnt
has massive scalaing problems because the cacheline gets
so terribly contended.
This uses a statically-allocated percpu variable. All
operations are local to a cpu as long that cpu operates on
the same mount, and there are no writer count imbalances.
Writer count imbalances happen when a write is taken on one
cpu, and released on another, like when an open/close pair
is performed on two different cpus because the task moved.
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we
determine if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple
of seconds in several cpus in parallel doing open/write/close
loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a a worst-possible case for this patch.
It does opens on a _pair_ of files in two different mounts
in parallel. This should cause my code to lose its "operate
on the same mount" optimization completely. This worst-case
scenario causes a 3% degredation in the benchmark.
I could probably get rid of even this 3%, but it would be
more complex than what I have here, and I think this is
getting into acceptable territory. In practice, I expect
writing more than 3 bytes to a file, as well as disk I/O
to mask any effects that this has.
(To get rid of that 3%, we could have an #defined number
of mounts in the percpu variable. So, instead of a CPU
getting operate only on percpu data when it accesses
only one mount, it could stay on percpu data when it
only accesses N or fewer mounts.)
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namespace.c | 205 ++++++++++++++++++++++++++++++++++++++---
lxc-dave/include/linux/mount.h | 8 +
2 files changed, 198 insertions(+), 15 deletions(-)
diff -puN fs/namespace.c~numa_mnt_want_write fs/namespace.c
--- lxc/fs/namespace.c~numa_mnt_want_write 2007-09-20 12:16:23.000000000 -0700
+++ lxc-dave/fs/namespace.c 2007-09-20 12:16:23.000000000 -0700
@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -52,6 +53,8 @@ static inline unsigned long hash(struct
return tmp & hash_mask;
}
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -65,6 +68,7 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -85,6 +89,84 @@ struct vfsmount *alloc_vfsmnt(const char
* we can determine when writes are able to occur to
* a filesystem.
*/
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*. This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t lock;
+ unsigned long count;
+ struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+ spin_lock_init(&writer->lock);
+ writer->count = 0;
+ }
+ return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void mnt_unlock_cpus(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_unlock(&cpu_writer->lock);
+ }
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+ if (!cpu_writer->mnt)
+ return;
+ atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+ cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+ struct vfsmount *mnt)
+{
+ if (cpu_writer->mnt == mnt)
+ return;
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
/**
* mnt_want_write - get write access to a mount
* @mnt: the mount on which to take a write
@@ -97,12 +179,58 @@ struct vfsmount *alloc_vfsmnt(const char
*/
int mnt_want_write(struct vfsmount *mnt)
{
- if (__mnt_is_readonly(mnt))
- return -EROFS;
- return 0;
+ int ret = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ cpu_writer->count++;
+out:
+ spin_unlock(&cpu_writer->lock);
+ put_cpu_var(mnt_writers);
+ return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
+static void lock_and_coalesce_cpu_mnt_writer_counts(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_lock(&cpu_writer->lock);
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = NULL;
+ }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+ while (atomic_read(&mnt->__mnt_writers) <
+ MNT_WRITER_UNDERFLOW_LIMIT) {
+ /*
+ * It isn't necessary to hold all of the locks
+ * at the same time, but doing it this way makes
+ * us share a lot more code.
+ */
+ lock_and_coalesce_cpu_mnt_writer_counts();
+ mnt_unlock_cpus();
+ }
+}
+
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -113,23 +241,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
*/
void mnt_drop_write(struct vfsmount *mnt)
{
+ int must_check_underflow = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ if (cpu_writer->count > 0) {
+ cpu_writer->count--;
+ } else {
+ must_check_underflow = 1;
+ atomic_dec(&mnt->__mnt_writers);
+ }
+
+ spin_unlock(&cpu_writer->lock);
+ /*
+ * Logically, we could call this each time,
+ * but the __mnt_writers cacheline tends to
+ * be cold, and makes this expensive.
+ */
+ if (must_check_underflow)
+ handle_write_count_underflow(mnt);
+ /*
+ * This could be done right after the spinlock
+ * is taken because the spinlock keeps us on
+ * the cpu, and disables preemption. However,
+ * putting it here bounds the amount that
+ * __mnt_writers can underflow. Without it,
+ * we could theoretically wrap __mnt_writers.
+ */
+ put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*. This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
{
- return (mnt->mnt_sb->s_flags & MS_RDONLY);
+ int ret = 0;
+
+ lock_and_coalesce_cpu_mnt_writer_counts();
+ /*
+ * With all the locks held, this value is stable
+ */
+ if (atomic_read(&mnt->__mnt_writers) > 0) {
+ ret = -EBUSY;
+ goto out;
+ }
+ /*
+ * actually set mount's r/o flag here to make
+ * __mnt_is_readonly() true, which keeps anyone
+ * from doing a successful mnt_want_write().
+ */
+out:
+ mnt_unlock_cpus();
+ return ret;
}
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
@@ -325,6 +491,15 @@ static struct vfsmount *clone_mnt(struct
static inline void __mntput(struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
+ lock_and_coalesce_cpu_mnt_writer_counts();
+ mnt_unlock_cpus();
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ WARN_ON(atomic_read(&mnt->__mnt_writers));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
diff -puN include/linux/mount.h~numa_mnt_want_write include/linux/mount.h
--- lxc/include/linux/mount.h~numa_mnt_want_write 2007-09-20 12:16:23.000000000 -0700
+++ lxc-dave/include/linux/mount.h 2007-09-20 12:16:23.000000000 -0700
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
@@ -61,6 +62,13 @@ struct vfsmount {
atomic_t mnt_count;
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
+ /*
+ * This value is not stable unless all of the
+ * mnt_writers[] spinlocks are held, and all
+ * mnt_writer[]s on this mount have 0 as
+ * their ->count
+ */
+ atomic_t __mnt_writers;
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
_
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-20 19:53 ` [PATCH 24/25] r/o bind mounts: track number of mount writers Dave Hansen
@ 2007-09-24 6:17 ` Andrew Morton
2007-09-24 14:34 ` Arjan van de Ven
2007-09-24 22:06 ` Dave Hansen
2007-09-24 17:54 ` Christoph Hellwig
1 sibling, 2 replies; 48+ messages in thread
From: Andrew Morton @ 2007-09-24 6:17 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Thu, 20 Sep 2007 12:53:20 -0700 Dave Hansen <haveblue@us.ibm.com> wrote:
> This is the real meat of the entire series. It actually
> implements the tracking of the number of writers to a mount.
> However, it causes scalability problems because there can
> be hundreds of cpus doing open()/close() on files on the
> same mnt at the same time. Even an atomic_t in the mnt
> has massive scalaing problems because the cacheline gets
> so terribly contended.
>
> This uses a statically-allocated percpu variable. All
> operations are local to a cpu as long that cpu operates on
> the same mount, and there are no writer count imbalances.
> Writer count imbalances happen when a write is taken on one
> cpu, and released on another, like when an open/close pair
> is performed on two different cpus because the task moved.
Did you test with lockdep enabled?
=============================================
[ INFO: possible recursive locking detected ]
2.6.23-rc7-mm1 #1
---------------------------------------------
swapper/1 is trying to acquire lock:
(&writer->lock){--..}, at: [<c0197a32>] lock_and_coalesce_cpu_mnt_writer_counts+0x32/0x70
but task is already holding lock:
(&writer->lock){--..}, at: [<c0197a32>] lock_and_coalesce_cpu_mnt_writer_counts+0x32/0x70
other info that might help us debug this:
1 lock held by swapper/1:
#0: (&writer->lock){--..}, at: [<c0197a32>] lock_and_coalesce_cpu_mnt_writer_counts+0x32/0x70
stack backtrace:
[<c0103ffa>] show_trace_log_lvl+0x1a/0x30
[<c0104b82>] show_trace+0x12/0x20
[<c0104c96>] dump_stack+0x16/0x20
[<c0144dc5>] __lock_acquire+0xde5/0x10a0
[<c01450fa>] lock_acquire+0x7a/0xa0
[<c03e734c>] _spin_lock+0x2c/0x40
[<c0197a32>] lock_and_coalesce_cpu_mnt_writer_counts+0x32/0x70
[<c01982c6>] mntput_no_expire+0x36/0xc0
[<c0188f15>] path_release_on_umount+0x15/0x20
[<c0198930>] sys_umount+0x40/0x230
[<c010070b>] name_to_dev_t+0x9b/0x270
[<c05230c2>] prepare_namespace+0x62/0x1b0
[<c05226ca>] kernel_init+0x21a/0x320
[<c0103b47>] kernel_thread_helper+0x7/0x10
=======================
It look like a false positive to me, but really, for a patchset of this
complexity and maturity I cannot fathom how it could have escaped any
lockdep testing.
^ permalink raw reply [flat|nested] 48+ messages in thread* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 6:17 ` Andrew Morton
@ 2007-09-24 14:34 ` Arjan van de Ven
2007-09-24 22:06 ` Dave Hansen
1 sibling, 0 replies; 48+ messages in thread
From: Arjan van de Ven @ 2007-09-24 14:34 UTC (permalink / raw)
To: Andrew Morton; +Cc: Dave Hansen, linux-kernel, hch
> It look like a false positive to me, but really, for a patchset of
> this complexity and maturity I cannot fathom how it could have
> escaped any lockdep testing.
the code tries to implement per cpu spinlocks, or rather it tries
to bring back the brlocks from way past.... cute.
we can educate lockdep about this quite easily; but isn't there some
primitive already in existence that we can use?
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 6:17 ` Andrew Morton
2007-09-24 14:34 ` Arjan van de Ven
@ 2007-09-24 22:06 ` Dave Hansen
2007-09-24 22:25 ` Andrew Morton
1 sibling, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-24 22:06 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch
On Sun, 2007-09-23 at 23:17 -0700, Andrew Morton wrote:
> It look like a false positive to me, but really, for a patchset of this
> complexity and maturity I cannot fathom how it could have escaped any
> lockdep testing.
I test with lockdep all the time. The problem was that lockdep doesn't
complain until you have 8 nested locks, and I only tested on a 4-cpu
system.
I lowered the lockdep nesting limit to 3, and got the warning on my
machine.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 22:06 ` Dave Hansen
@ 2007-09-24 22:25 ` Andrew Morton
2007-09-24 23:05 ` Dave Hansen
0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-09-24 22:25 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Mon, 24 Sep 2007 15:06:42 -0700
Dave Hansen <haveblue@us.ibm.com> wrote:
> On Sun, 2007-09-23 at 23:17 -0700, Andrew Morton wrote:
> > It look like a false positive to me, but really, for a patchset of this
> > complexity and maturity I cannot fathom how it could have escaped any
> > lockdep testing.
>
> I test with lockdep all the time. The problem was that lockdep doesn't
> complain until you have 8 nested locks, and I only tested on a 4-cpu
> system.
>
> I lowered the lockdep nesting limit to 3, and got the warning on my
> machine.
>
hm. I saw that warning on my 2-way. It has CONFIG_NR_CPUS=8 so perhaps
the kernel has decided that this machine can possibly have eight CPUs.
It's an old super-micro board, doesn't have ACPI.
OEM ID: INTEL Product ID: 440BX APIC at: 0xFEE00000
Processor #0 6:8 APIC version 17
Processor #1 6:8 APIC version 17
I/O APIC #2 Version 17 at 0xFEC00000.
Enabling APIC mode: Flat. Using 1 I/O APICs
Processors: 2
...
Checking if this processor honours the WP bit even in supervisor mode... Ok.
Calibrating delay using timer specific routine.. 1707.03 BogoMIPS (lpj=3414067)
Mount-cache hash table entries: 512
CPU: After generic identify, caps: 0383fbff 00000000 00000000 00000000 00000000 00000000 00000000 00000000
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 256K
CPU: After all inits, caps: 0383fbff 00000000 00000000 00000040 00000000 00000000 00000000 00000000
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#0.
Compat vDSO mapped to ffffe000.
Checking 'hlt' instruction... OK.
lockdep: not fixing up alternatives.
CPU0: Intel Pentium III (Coppermine) stepping 03
lockdep: not fixing up alternatives.
Booting processor 1/1 eip 2000
Initializing CPU#1
Calibrating delay using timer specific routine.. 1704.02 BogoMIPS (lpj=3408054)
CPU: After generic identify, caps: 0383fbff 00000000 00000000 00000000 00000000 00000000 00000000 00000000
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 256K
CPU: After all inits, caps: 0383fbff 00000000 00000000 00000040 00000000 00000000 00000000 00000000
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#1.
CPU1: Intel Pentium III (Coppermine) stepping 03
Total of 2 processors activated (3411.06 BogoMIPS).
ExtINT not setup in hardware but reported by MP table
ENABLING IO-APIC IRQs
..TIMER: vector=0x31 apic1=0 pin1=2 apic2=0 pin2=0
APIC timer registered as dummy, due to nmi_watchdog=1!
checking TSC synchronization [CPU#0 -> CPU#1]: passed.
Brought up 2 CPUs
One would think that the kernel would work out that eight CPUs ain't
possible on such a machine. But we don't seem to print out any info which
allows me to confirm that this is really what the kernel did.
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 22:25 ` Andrew Morton
@ 2007-09-24 23:05 ` Dave Hansen
2007-09-24 23:15 ` Andrew Morton
0 siblings, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-24 23:05 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch
On Mon, 2007-09-24 at 15:25 -0700, Andrew Morton wrote:
> hm. I saw that warning on my 2-way. It has CONFIG_NR_CPUS=8 so perhaps
> the kernel has decided that this machine can possibly have eight CPUs.
>
> It's an old super-micro board, doesn't have ACPI.
Well, it's looking like we only set cpu_possible_map from data we find
in the MP table, which makes sense. The only question is how your
system gets more than ~8 possible cpus. Do you have the .config handy?
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 23:05 ` Dave Hansen
@ 2007-09-24 23:15 ` Andrew Morton
2007-09-25 16:10 ` Dave Hansen
0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-09-24 23:15 UTC (permalink / raw)
To: Dave Hansen; +Cc: linux-kernel, hch
On Mon, 24 Sep 2007 16:05:37 -0700
Dave Hansen <haveblue@us.ibm.com> wrote:
> On Mon, 2007-09-24 at 15:25 -0700, Andrew Morton wrote:
> > hm. I saw that warning on my 2-way. It has CONFIG_NR_CPUS=8 so perhaps
> > the kernel has decided that this machine can possibly have eight CPUs.
> >
> > It's an old super-micro board, doesn't have ACPI.
>
> Well, it's looking like we only set cpu_possible_map from data we find
> in the MP table, which makes sense. The only question is how your
> system gets more than ~8 possible cpus. Do you have the .config handy?
>
http://userweb.kernel.org/~akpm/config-vmm.txt
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 23:15 ` Andrew Morton
@ 2007-09-25 16:10 ` Dave Hansen
0 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-25 16:10 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, hch
On Mon, 2007-09-24 at 16:15 -0700, Andrew Morton wrote:
> On Mon, 24 Sep 2007 16:05:37 -0700
> Dave Hansen <haveblue@us.ibm.com> wrote:
>
> > On Mon, 2007-09-24 at 15:25 -0700, Andrew Morton wrote:
> > > hm. I saw that warning on my 2-way. It has CONFIG_NR_CPUS=8 so perhaps
> > > the kernel has decided that this machine can possibly have eight CPUs.
> > >
> > > It's an old super-micro board, doesn't have ACPI.
> >
> > Well, it's looking like we only set cpu_possible_map from data we find
> > in the MP table, which makes sense. The only question is how your
> > system gets more than ~8 possible cpus. Do you have the .config handy?
>
> http://userweb.kernel.org/~akpm/config-vmm.txt
I've reproduced this. I'll do more runtime testing with all of the
debugging, CONFIG_CPU_HOTPLUG=y, and NR_CPUS to a high value. It should
help catch things like this in the future.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-20 19:53 ` [PATCH 24/25] r/o bind mounts: track number of mount writers Dave Hansen
2007-09-24 6:17 ` Andrew Morton
@ 2007-09-24 17:54 ` Christoph Hellwig
2007-09-24 19:10 ` Andrew Morton
2007-09-24 19:28 ` Dave Hansen
1 sibling, 2 replies; 48+ messages in thread
From: Christoph Hellwig @ 2007-09-24 17:54 UTC (permalink / raw)
To: Dave Hansen; +Cc: akpm, linux-kernel, hch
As we already say in various messages the percpu counters in here
look rather fishy. I'd recomment to take a look at the per-cpu
superblock counters in XFS as they've been debugged quite well
now and could probably be lifted into a generic library for this
kind of think. The code is mostly in fs/xfs/xfs_mount.c can
can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
It also handles cases like hotplug cpu nicely that this code
seems to work around by always iterating over all possible cpus
which might not be nice on a dual core laptop with a distro kernel
that also has to support big iron.
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 17:54 ` Christoph Hellwig
@ 2007-09-24 19:10 ` Andrew Morton
2007-09-24 19:24 ` Christoph Hellwig
2007-09-24 19:28 ` Dave Hansen
1 sibling, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-09-24 19:10 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Dave Hansen, linux-kernel
On Mon, 24 Sep 2007 18:54:11 +0100
Christoph Hellwig <hch@infradead.org> wrote:
> As we already say in various messages the percpu counters in here
> look rather fishy. I'd recomment to take a look at the per-cpu
> superblock counters in XFS as they've been debugged quite well
> now and could probably be lifted into a generic library for this
> kind of think. The code is mostly in fs/xfs/xfs_mount.c can
> can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
>
> It also handles cases like hotplug cpu nicely that this code
> seems to work around by always iterating over all possible cpus
> which might not be nice on a dual core laptop with a distro kernel
> that also has to support big iron.
hm. How come xfs invented a new version of percpu_counters?
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 19:10 ` Andrew Morton
@ 2007-09-24 19:24 ` Christoph Hellwig
0 siblings, 0 replies; 48+ messages in thread
From: Christoph Hellwig @ 2007-09-24 19:24 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Hellwig, Dave Hansen, linux-kernel, xfs
On Mon, Sep 24, 2007 at 12:10:35PM -0700, Andrew Morton wrote:
> > As we already say in various messages the percpu counters in here
> > look rather fishy. I'd recomment to take a look at the per-cpu
> > superblock counters in XFS as they've been debugged quite well
> > now and could probably be lifted into a generic library for this
> > kind of think. The code is mostly in fs/xfs/xfs_mount.c can
> > can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
> >
> > It also handles cases like hotplug cpu nicely that this code
> > seems to work around by always iterating over all possible cpus
> > which might not be nice on a dual core laptop with a distro kernel
> > that also has to support big iron.
>
> hm. How come xfs invented a new version of percpu_counters?
This code actually predates the generic percpu_counters even if it
was merged to mainline later. Neither Dave who wrote it nor me
who reviewed it before it was merged thought of percpu_counters
probably. Then again this code is considerably more complex due
to features actually needed in a very hot fastpath.
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 17:54 ` Christoph Hellwig
2007-09-24 19:10 ` Andrew Morton
@ 2007-09-24 19:28 ` Dave Hansen
2007-09-24 19:42 ` Andrew Morton
1 sibling, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2007-09-24 19:28 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: akpm, linux-kernel
On Mon, 2007-09-24 at 18:54 +0100, Christoph Hellwig wrote:
> As we already say in various messages the percpu counters in here
> look rather fishy. I'd recomment to take a look at the per-cpu
> superblock counters in XFS as they've been debugged quite well
> now and could probably be lifted into a generic library for this
> kind of think. The code is mostly in fs/xfs/xfs_mount.c can
> can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
>
> It also handles cases like hotplug cpu nicely that this code
> seems to work around by always iterating over all possible cpus
> which might not be nice on a dual core laptop with a distro kernel
> that also has to support big iron.
I'll take a look at xfs to see what I can get out of it.
There are basically two times when you have to do this
for_each_possible_cpu() stuff:
1. when doing a r/w->r/o transition, which is rare, and
certainly not a fast path
2. Where the per-cpu writer count underflows. This requires
a _minimum_ of 1<<16 file opens (configurable) each of which
is closed on a different cpu than it was opened on. Even
if you were trying, I'm not sure you'd notice the overhead.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 19:28 ` Dave Hansen
@ 2007-09-24 19:42 ` Andrew Morton
2007-10-01 18:06 ` Dave Hansen
0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2007-09-24 19:42 UTC (permalink / raw)
To: Dave Hansen; +Cc: Christoph Hellwig, linux-kernel
On Mon, 24 Sep 2007 12:28:11 -0700
Dave Hansen <haveblue@us.ibm.com> wrote:
> On Mon, 2007-09-24 at 18:54 +0100, Christoph Hellwig wrote:
> > As we already say in various messages the percpu counters in here
> > look rather fishy. I'd recomment to take a look at the per-cpu
> > superblock counters in XFS as they've been debugged quite well
> > now and could probably be lifted into a generic library for this
> > kind of think. The code is mostly in fs/xfs/xfs_mount.c can
> > can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
> >
> > It also handles cases like hotplug cpu nicely that this code
> > seems to work around by always iterating over all possible cpus
> > which might not be nice on a dual core laptop with a distro kernel
> > that also has to support big iron.
>
> I'll take a look at xfs to see what I can get out of it.
And at include/linux/percpu_counter.h, please.
> There are basically two times when you have to do this
> for_each_possible_cpu() stuff:
> 1. when doing a r/w->r/o transition, which is rare, and
> certainly not a fast path
> 2. Where the per-cpu writer count underflows. This requires
> a _minimum_ of 1<<16 file opens (configurable) each of which
> is closed on a different cpu than it was opened on. Even
> if you were trying, I'm not sure you'd notice the overhead.
>
Sounds like what you're doing is more akin to the local_t-based module
refcounting. `grep local_ kernel/module.c'.
That code should be converted from NR_CPUS to for_each_possible_cpu()..
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH 24/25] r/o bind mounts: track number of mount writers
2007-09-24 19:42 ` Andrew Morton
@ 2007-10-01 18:06 ` Dave Hansen
0 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-10-01 18:06 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Hellwig, linux-kernel
On Mon, 2007-09-24 at 12:42 -0700, Andrew Morton wrote:
> On Mon, 24 Sep 2007 12:28:11 -0700
> Dave Hansen <haveblue@us.ibm.com> wrote:
> > On Mon, 2007-09-24 at 18:54 +0100, Christoph Hellwig wrote:
> > > As we already say in various messages the percpu counters in here
> > > look rather fishy. I'd recomment to take a look at the per-cpu
> > > superblock counters in XFS as they've been debugged quite well
> > > now and could probably be lifted into a generic library for this
> > > kind of think. The code is mostly in fs/xfs/xfs_mount.c can
> > > can be spotted by beeing under #ifdef HAVE_PERCPU_SB.
> > >
> > > It also handles cases like hotplug cpu nicely that this code
> > > seems to work around by always iterating over all possible cpus
> > > which might not be nice on a dual core laptop with a distro kernel
> > > that also has to support big iron.
> >
> > I'll take a look at xfs to see what I can get out of it.
>
> And at include/linux/percpu_counter.h, please.
The basic incompatibility with what that provides and what I need is
that percpu_counters allow some fuzziness in the numbers. One cpu can
be summing the numbers up while another is still adding to the local
percpu counters. That's fine for statistics, but horrible for questions
like, "can anybody write to and corrupt my FS right now?"
It could probably be modified to do what I want, but it would still have
a the "invented your own lock" problem, and would likely impact the
scalability of the existing "fuzzy" users.
We could introduce fuzzy and coherent variants of the function calls,
but that would probably introduce more code than what I have now for the
very specific mnt_writer_lock.
> > There are basically two times when you have to do this
> > for_each_possible_cpu() stuff:
> > 1. when doing a r/w->r/o transition, which is rare, and
> > certainly not a fast path
> > 2. Where the per-cpu writer count underflows. This requires
> > a _minimum_ of 1<<16 file opens (configurable) each of which
> > is closed on a different cpu than it was opened on. Even
> > if you were trying, I'm not sure you'd notice the overhead.
> >
>
> Sounds like what you're doing is more akin to the local_t-based module
> refcounting. `grep local_ kernel/module.c'.
>
> That code should be converted from NR_CPUS to for_each_possible_cpu()..
I think that can get converted to use the percpu_counters pretty easily.
I've coded that up, and sent it [RFC] to lkml. Rusty will forward on
into mainline.
-- Dave
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH 25/25] honor r/w changes at do_remount() time
2007-09-20 19:52 [PATCH 00/25] Read-only bind mounts Dave Hansen
` (23 preceding siblings ...)
2007-09-20 19:53 ` [PATCH 24/25] r/o bind mounts: track number of mount writers Dave Hansen
@ 2007-09-20 19:53 ` Dave Hansen
24 siblings, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2007-09-20 19:53 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, hch, Dave Hansen
Originally from: Herbert Poetzl <herbert@13thfloor.at>
This is the core of the read-only bind mount patch set.
Note that this does _not_ add a "ro" option directly to
the bind mount operation. If you require such a mount,
you must first do the bind, then follow it up with a
'mount -o remount,ro' operation.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namespace.c | 46 ++++++++++++++++++++++++++++++++++-------
lxc-dave/include/linux/mount.h | 1
2 files changed, 40 insertions(+), 7 deletions(-)
diff -puN fs/namespace.c~honor-r-w-changes-at-do-remount-time fs/namespace.c
--- lxc/fs/namespace.c~honor-r-w-changes-at-do-remount-time 2007-09-20 12:16:23.000000000 -0700
+++ lxc-dave/fs/namespace.c 2007-09-20 12:16:23.000000000 -0700
@@ -102,7 +102,11 @@ struct vfsmount *alloc_vfsmnt(const char
*/
int __mnt_is_readonly(struct vfsmount *mnt)
{
- return (mnt->mnt_sb->s_flags & MS_RDONLY);
+ if (mnt->mnt_flags & MNT_READONLY)
+ return 1;
+ if (mnt->mnt_sb->s_flags & MS_RDONLY)
+ return 1;
+ return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
@@ -275,7 +279,7 @@ void mnt_drop_write(struct vfsmount *mnt
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-int mnt_make_readonly(struct vfsmount *mnt)
+static int mnt_make_readonly(struct vfsmount *mnt)
{
int ret = 0;
@@ -288,15 +292,21 @@ int mnt_make_readonly(struct vfsmount *m
goto out;
}
/*
- * actually set mount's r/o flag here to make
- * __mnt_is_readonly() true, which keeps anyone
- * from doing a successful mnt_want_write().
+ * nobody can do a successful mnt_want_write() with all
+ * of the counts in MNT_DENIED_WRITE and the locks held.
*/
+ if (!ret)
+ mnt->mnt_flags |= MNT_READONLY;
out:
mnt_unlock_cpus();
return ret;
}
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+ mnt->mnt_flags &= ~MNT_READONLY;
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -605,7 +615,7 @@ static int show_vfsmnt(struct seq_file *
seq_putc(m, '.');
mangle(m, mnt->mnt_sb->s_subtype);
}
- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+ seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
if (mnt->mnt_sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
@@ -1173,6 +1183,23 @@ out:
return err;
}
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+ int error = 0;
+ int readonly_request = 0;
+
+ if (ms_flags & MS_RDONLY)
+ readonly_request = 1;
+ if (readonly_request == __mnt_is_readonly(mnt))
+ return 0;
+
+ if (readonly_request)
+ error = mnt_make_readonly(mnt);
+ else
+ __mnt_unmake_readonly(mnt);
+ return error;
+}
+
/*
* change filesystem flags. dir should be a physical root of filesystem.
* If you've mounted a non-root directory somewhere and want to do remount
@@ -1194,7 +1221,10 @@ static int do_remount(struct nameidata *
return -EINVAL;
down_write(&sb->s_umount);
- err = do_remount_sb(sb, flags, data, 0);
+ if (flags & MS_BIND)
+ err = change_mount_flags(nd->mnt, flags);
+ else
+ err = do_remount_sb(sb, flags, data, 0);
if (!err)
nd->mnt->mnt_flags = mnt_flags;
up_write(&sb->s_umount);
@@ -1638,6 +1668,8 @@ long do_mount(char *dev_name, char *dir_
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME);
diff -puN include/linux/mount.h~honor-r-w-changes-at-do-remount-time include/linux/mount.h
--- lxc/include/linux/mount.h~honor-r-w-changes-at-do-remount-time 2007-09-20 12:16:23.000000000 -0700
+++ lxc-dave/include/linux/mount.h 2007-09-20 12:16:23.000000000 -0700
@@ -29,6 +29,7 @@ struct mnt_namespace;
#define MNT_NOATIME 0x08
#define MNT_NODIRATIME 0x10
#define MNT_RELATIME 0x20
+#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
#define MNT_SHRINKABLE 0x100
_
^ permalink raw reply [flat|nested] 48+ messages in thread