* [RFC][PATCH 2/8] move mnt_want_write() into open_namei_create()
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 3/8] move mnt_want_write() out of may_open() Dave Hansen
` (5 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
In a moment, we're going to make may_open() stop doing
the mnt_want/drop_write() pair. Doing this first makes
the next patch simpler.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff -puN fs/namei.c~move-mnt_want_write-into-open_namei_create fs/namei.c
--- lxc/fs/namei.c~move-mnt_want_write-into-open_namei_create 2007-09-28 11:03:19.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:19.000000000 -0700
@@ -1686,6 +1686,19 @@ static int open_namei_create(struct name
int error;
struct dentry *dir = nd->dentry;
+ /*
+ * This ensures that the mnt stays writable
+ * over the vfs_create() call to may_open(),
+ * which takes a more persistent
+ * mnt_want_write().
+ */
+ error = mnt_want_write(nd->mnt);
+ if (error) {
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(nd->dentry);
+ return error;
+ }
+
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
error = vfs_create(dir->d_inode, path->dentry, mode, nd);
@@ -1693,9 +1706,16 @@ static int open_namei_create(struct name
dput(nd->dentry);
nd->dentry = path->dentry;
if (error)
- return error;
+ goto out;
/* Don't check for write permission, don't truncate */
- return may_open(nd, 0, flag & ~O_TRUNC);
+ error = may_open(nd, 0, flag & ~O_TRUNC);
+out:
+ /*
+ * if needed, may_open() has taken a write
+ * on the mnt for us, so we can release ours.
+ */
+ mnt_drop_write(nd->mnt);
+ return error;
}
/*
@@ -1778,11 +1798,7 @@ do_last:
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = mnt_want_write(nd->mnt);
- if (error)
- goto exit_mutex_unlock;
error = open_namei_create(nd, &path, flag, mode);
- mnt_drop_write(nd->mnt);
if (error)
goto exit;
return 0;
_
^ permalink raw reply [flat|nested] 10+ messages in thread* [RFC][PATCH 3/8] move mnt_want_write() out of may_open()
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 2/8] move mnt_want_write() into open_namei_create() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-10-01 19:55 ` Miklos Szeredi
2007-09-28 18:13 ` [RFC][PATCH 4/8] do namei_flags calculation inside open_namei() Dave Hansen
` (4 subsequent siblings)
6 siblings, 1 reply; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
may_open() can fail in a lot of ways. It is also named
such that it doesn't appear to be _taking_ action, just
checking a bunch of conditions.
So, it makes a poor place to take and release the mnt
writer count. This moves the burder of taking the
mnt writer counts into the callers. The callers have
the advantage of much more visibility of the filp.
Since we rely on the __fput(filp) to release the mnt
writer count, this is a good thing.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 41 +++++++++++++++++++++++------------------
lxc-dave/fs/nfsctl.c | 20 ++++++++++++++++++--
2 files changed, 41 insertions(+), 20 deletions(-)
diff -puN fs/namei.c~move-mnt_want_write-out-of-may_open fs/namei.c
--- lxc/fs/namei.c~move-mnt_want_write-out-of-may_open 2007-09-28 11:03:20.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:20.000000000 -0700
@@ -1621,14 +1621,6 @@ int may_open(struct nameidata *nd, int a
return -EACCES;
flag &= ~O_TRUNC;
- } else if (flag & FMODE_WRITE) {
- /*
- * effectively: !special_file()
- * balanced by __fput()
- */
- error = mnt_want_write(nd->mnt);
- if (error)
- return error;
}
error = vfs_permission(nd, acc_mode);
@@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
struct dentry *dir = nd->dentry;
/*
- * This ensures that the mnt stays writable
- * over the vfs_create() call to may_open(),
- * which takes a more persistent
- * mnt_want_write().
+ * This mnt_want_write() is potentially persistent,
+ * and balanced in __fput()
*/
error = mnt_want_write(nd->mnt);
if (error) {
@@ -1710,14 +1700,18 @@ static int open_namei_create(struct name
/* Don't check for write permission, don't truncate */
error = may_open(nd, 0, flag & ~O_TRUNC);
out:
- /*
- * if needed, may_open() has taken a write
- * on the mnt for us, so we can release ours.
- */
- mnt_drop_write(nd->mnt);
+ if (error)
+ mnt_drop_write(nd->mnt);
return error;
}
+static inline int write_may_occur_to_file(struct inode *inode, int acc_mode)
+{
+ if (special_file(inode->i_mode))
+ return 0;
+ return (acc_mode & FMODE_WRITE);
+}
+
/*
* open_namei()
*
@@ -1831,9 +1825,20 @@ do_last:
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
ok:
- error = may_open(nd, acc_mode, flag);
+ /*
+ * This mnt_want_write() is potentially persistent,
+ * and balanced in __fput()
+ */
+ if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
+ error = mnt_want_write(nd->mnt);
if (error)
goto exit;
+ error = may_open(nd, acc_mode, flag);
+ if (error) {
+ if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
+ mnt_drop_write(nd->mnt);
+ goto exit;
+ }
return 0;
exit_mutex_unlock:
diff -puN fs/nfsctl.c~move-mnt_want_write-out-of-may_open fs/nfsctl.c
--- lxc/fs/nfsctl.c~move-mnt_want_write-out-of-may_open 2007-09-28 11:03:20.000000000 -0700
+++ lxc-dave/fs/nfsctl.c 2007-09-28 11:03:20.000000000 -0700
@@ -22,6 +22,7 @@
static struct file *do_open(char *name, int flags)
{
+ struct file *filp;
struct nameidata nd;
struct vfsmount *mnt;
int error;
@@ -35,14 +36,29 @@ static struct file *do_open(char *name,
if (error)
return ERR_PTR(error);
+ /*
+ * This is balanced when you __fput() the 'struct file'
+ * returned by dentry_open() below. If the may_open()
+ * ever stops containing FMODE_WRITE, this needs to
+ * be changed.
+ */
+ error = mnt_want_write(mnt);
+ if (error)
+ return ERR_PTR(error);
if (flags == O_RDWR)
error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
else
error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
- if (!error)
- return dentry_open(nd.dentry, nd.mnt, flags);
+ if (error)
+ goto out;
+ filp = dentry_open(nd.dentry, nd.mnt, flags);
+ if (!IS_ERR(filp))
+ return filp;
+ error = PTR_ERR(error);
+out:
+ mnt_drop_write(mnt);
path_release(&nd);
return ERR_PTR(error);
}
_
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [RFC][PATCH 3/8] move mnt_want_write() out of may_open()
2007-09-28 18:13 ` [RFC][PATCH 3/8] move mnt_want_write() out of may_open() Dave Hansen
@ 2007-10-01 19:55 ` Miklos Szeredi
2007-10-01 20:10 ` Dave Hansen
0 siblings, 1 reply; 10+ messages in thread
From: Miklos Szeredi @ 2007-10-01 19:55 UTC (permalink / raw)
To: haveblue; +Cc: linux-kernel, hch, miklos, haveblue
> @@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
> struct dentry *dir = nd->dentry;
>
> /*
> - * This ensures that the mnt stays writable
> - * over the vfs_create() call to may_open(),
> - * which takes a more persistent
> - * mnt_want_write().
> + * This mnt_want_write() is potentially persistent,
> + * and balanced in __fput()
> */
> error = mnt_want_write(nd->mnt);
> if (error) {
I'm confused: isn't it the mnt_want_write() in __dentry_open(), that
is balanced in __fput()?
Miklos
^ permalink raw reply [flat|nested] 10+ messages in thread* Re: [RFC][PATCH 3/8] move mnt_want_write() out of may_open()
2007-10-01 19:55 ` Miklos Szeredi
@ 2007-10-01 20:10 ` Dave Hansen
0 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-10-01 20:10 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: linux-kernel, hch
On Mon, 2007-10-01 at 21:55 +0200, Miklos Szeredi wrote:
> > @@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
> > struct dentry *dir = nd->dentry;
> >
> > /*
> > - * This ensures that the mnt stays writable
> > - * over the vfs_create() call to may_open(),
> > - * which takes a more persistent
> > - * mnt_want_write().
> > + * This mnt_want_write() is potentially persistent,
> > + * and balanced in __fput()
> > */
> > error = mnt_want_write(nd->mnt);
> > if (error) {
>
> I'm confused: isn't it the mnt_want_write() in __dentry_open(), that
> is balanced in __fput()?
This is broken. I didn't realize that nameidata_to_filp() called
dentry_open. I'll rework these.
-- Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
* [RFC][PATCH 4/8] do namei_flags calculation inside open_namei()
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 2/8] move mnt_want_write() into open_namei_create() Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 3/8] move mnt_want_write() out of may_open() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 5/8] make open_namei() return a filp Dave Hansen
` (3 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
My end goal here is to make sure all users of may_open()
return filps. This will ensure that we properly release
mount write counts which were taken for the filp in
may_open().
This patch moves the sys_open flags to namei flags
calculation into fs/namei.c. We'll shortly be moving
the nameidata_to_filp() calls into namei.c, and this
gets the sys_open flags to a place where we can get
at them when we need them.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 43 ++++++++++++++++++++++++++++++++++---------
lxc-dave/fs/open.c | 22 ++--------------------
2 files changed, 36 insertions(+), 29 deletions(-)
diff -puN fs/namei.c~do-namei_flags-calculation-inside-open_namei fs/namei.c
--- lxc/fs/namei.c~do-namei_flags-calculation-inside-open_namei 2007-09-28 11:03:21.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:21.000000000 -0700
@@ -1672,7 +1672,12 @@ int may_open(struct nameidata *nd, int a
return 0;
}
-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function. Its flags must be in the namei format, not
+ * what get passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
int flag, int mode)
{
int error;
@@ -1713,26 +1718,46 @@ static inline int write_may_occur_to_fil
}
/*
+ * Note that while the flag value (low two bits) for sys_open means:
+ * 00 - read-only
+ * 01 - write-only
+ * 10 - read-write
+ * 11 - special
+ * it is changed into
+ * 00 - no permissions needed
+ * 01 - read-permission
+ * 10 - write-permission
+ * 11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
+ *
+*/
+static inline int sys_open_flags_to_namei_flags(int flag)
+{
+ if ((flag+1) & O_ACCMODE)
+ flag++;
+ return flag;
+}
+
+/*
* open_namei()
*
* namei for open - this is in fact almost the whole open-routine.
*
* Note that the low bits of "flag" aren't the same as in the open
- * system call - they are 00 - no permissions needed
- * 01 - read permission needed
- * 10 - write permission needed
- * 11 - read/write permissions needed
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
+ * system call. See sys_open_flags_to_namei_flags().
* SMP-safe
*/
-int open_namei(int dfd, const char *pathname, int flag,
+int open_namei(int dfd, const char *pathname, int sys_open_flag,
int mode, struct nameidata *nd)
{
int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
+ int flag = sys_open_flags_to_namei_flags(sys_open_flag);
acc_mode = ACC_MODE(flag);
@@ -1792,7 +1817,7 @@ do_last:
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = open_namei_create(nd, &path, flag, mode);
+ error = __open_namei_create(nd, &path, flag, mode);
if (error)
goto exit;
return 0;
diff -puN fs/open.c~do-namei_flags-calculation-inside-open_namei fs/open.c
--- lxc/fs/open.c~do-namei_flags-calculation-inside-open_namei 2007-09-28 11:03:21.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:21.000000000 -0700
@@ -840,31 +840,13 @@ cleanup_file:
return ERR_PTR(error);
}
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
static struct file *do_filp_open(int dfd, const char *filename, int flags,
int mode)
{
- int namei_flags, error;
+ int error;
struct nameidata nd;
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
- namei_flags++;
-
- error = open_namei(dfd, filename, namei_flags, mode, &nd);
+ error = open_namei(dfd, filename, flags, mode, &nd);
if (!error)
return nameidata_to_filp(&nd, flags);
_
^ permalink raw reply [flat|nested] 10+ messages in thread* [RFC][PATCH 5/8] make open_namei() return a filp
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
` (2 preceding siblings ...)
2007-09-28 18:13 ` [RFC][PATCH 4/8] do namei_flags calculation inside open_namei() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 6/8] kill do_filp_open() Dave Hansen
` (2 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
If open_namei() succeeds, there is potentially a mnt_want_write()
that needs to get balanced. If the caller doesn't create a
'struct file' and eventually __fput() it, or manually drop the
write count on an error, we have a bug.
Forcing open_namei() to return a filp fixes this. Any caller
getting a 'struct file' back must consider that filp instantiated
and fput() it normally. The callers no longer have to worry about
ever manually releasing a mnt write count.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/namei.c | 16 ++++++++--------
lxc-dave/fs/open.c | 7 +------
lxc-dave/include/linux/fs.h | 2 +-
3 files changed, 10 insertions(+), 15 deletions(-)
diff -puN fs/namei.c~make-open_namei-return-a-filp fs/namei.c
--- lxc/fs/namei.c~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:22.000000000 -0700
@@ -1750,8 +1750,8 @@ static inline int sys_open_flags_to_name
* system call. See sys_open_flags_to_namei_flags().
* SMP-safe
*/
-int open_namei(int dfd, const char *pathname, int sys_open_flag,
- int mode, struct nameidata *nd)
+struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
+ int mode, struct nameidata *nd)
{
int acc_mode, error;
struct path path;
@@ -1777,7 +1777,7 @@ int open_namei(int dfd, const char *path
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
nd, flag);
if (error)
- return error;
+ return ERR_PTR(error);
goto ok;
}
@@ -1786,7 +1786,7 @@ int open_namei(int dfd, const char *path
*/
error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
if (error)
- return error;
+ return ERR_PTR(error);
/*
* We have the parent and last component. First of all, check
@@ -1820,7 +1820,7 @@ do_last:
error = __open_namei_create(nd, &path, flag, mode);
if (error)
goto exit;
- return 0;
+ return nameidata_to_filp(nd, sys_open_flag);
}
/*
@@ -1864,7 +1864,7 @@ ok:
mnt_drop_write(nd->mnt);
goto exit;
}
- return 0;
+ return nameidata_to_filp(nd, sys_open_flag);
exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
@@ -1874,7 +1874,7 @@ exit:
if (!IS_ERR(nd->intent.open.file))
release_open_intent(nd);
path_release(nd);
- return error;
+ return ERR_PTR(error);
do_link:
error = -ELOOP;
@@ -1901,7 +1901,7 @@ do_link:
* with "intent.open".
*/
release_open_intent(nd);
- return error;
+ return ERR_PTR(error);
}
nd->flags &= ~LOOKUP_PARENT;
if (nd->last_type == LAST_BIND)
diff -puN fs/open.c~make-open_namei-return-a-filp fs/open.c
--- lxc/fs/open.c~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:22.000000000 -0700
@@ -843,14 +843,9 @@ cleanup_file:
static struct file *do_filp_open(int dfd, const char *filename, int flags,
int mode)
{
- int error;
struct nameidata nd;
- error = open_namei(dfd, filename, flags, mode, &nd);
- if (!error)
- return nameidata_to_filp(&nd, flags);
-
- return ERR_PTR(error);
+ return open_namei(dfd, filename, flags, mode, &nd);
}
struct file *filp_open(const char *filename, int flags, int mode)
diff -puN include/linux/fs.h~make-open_namei-return-a-filp include/linux/fs.h
--- lxc/include/linux/fs.h~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:22.000000000 -0700
@@ -1721,7 +1721,7 @@ extern struct file *create_read_pipe(str
extern struct file *create_write_pipe(void);
extern void free_write_pipe(struct file *);
-extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
+extern struct file *open_namei(int dfd, const char *, int, int, struct nameidata *);
extern int may_open(struct nameidata *, int, int);
extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
_
^ permalink raw reply [flat|nested] 10+ messages in thread* [RFC][PATCH 6/8] kill do_filp_open()
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
` (3 preceding siblings ...)
2007-09-28 18:13 ` [RFC][PATCH 5/8] make open_namei() return a filp Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 7/8] kill filp_open() Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 8/8] keep track of mnt_writer state of struct file Dave Hansen
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
This kills off the almost empty do_filp_open(). The indenting
change in do_sys_open() is because we would have gone over our
80 characters otherwise.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/open.c | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)
diff -puN fs/open.c~kill-do_filp_open fs/open.c
--- lxc/fs/open.c~kill-do_filp_open 2007-09-28 11:03:24.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:24.000000000 -0700
@@ -840,17 +840,10 @@ cleanup_file:
return ERR_PTR(error);
}
-static struct file *do_filp_open(int dfd, const char *filename, int flags,
- int mode)
-{
- struct nameidata nd;
-
- return open_namei(dfd, filename, flags, mode, &nd);
-}
-
struct file *filp_open(const char *filename, int flags, int mode)
{
- return do_filp_open(AT_FDCWD, filename, flags, mode);
+ struct nameidata nd;
+ return open_namei(AT_FDCWD, filename, flags, mode, &nd);
}
EXPORT_SYMBOL(filp_open);
@@ -1049,20 +1042,24 @@ long do_sys_open(int dfd, const char __u
char *tmp = getname(filename);
int fd = PTR_ERR(tmp);
- if (!IS_ERR(tmp)) {
- fd = get_unused_fd_flags(flags);
- if (fd >= 0) {
- struct file *f = do_filp_open(dfd, tmp, flags, mode);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else {
- fsnotify_open(f->f_path.dentry);
- fd_install(fd, f);
- }
+ if (IS_ERR(tmp))
+ goto out;
+
+ fd = get_unused_fd_flags(flags);
+ if (fd >= 0) {
+ struct nameidata nd;
+ struct file *f = open_namei(dfd, tmp, flags, mode, &nd);
+
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ } else {
+ fsnotify_open(f->f_path.dentry);
+ fd_install(fd, f);
}
- putname(tmp);
}
+ putname(tmp);
+out:
return fd;
}
_
^ permalink raw reply [flat|nested] 10+ messages in thread* [RFC][PATCH 7/8] kill filp_open()
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
` (4 preceding siblings ...)
2007-09-28 18:13 ` [RFC][PATCH 6/8] kill do_filp_open() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
2007-09-28 18:13 ` [RFC][PATCH 8/8] keep track of mnt_writer state of struct file Dave Hansen
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
Replace all callers with open_namei() directly, and move the
nameidata stack allocation into open_namei().
Does it make sense to still call it open_namei(), even though it
doesn't actually take a nameidata any more?
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/drivers/usb/gadget/file_storage.c | 5 +
lxc-dave/fs/exec.c | 2
lxc-dave/fs/namei.c | 77 ++++++++++++++---------------
lxc-dave/fs/open.c | 6 --
lxc-dave/fs/reiserfs/journal.c | 2
lxc-dave/include/linux/fs.h | 2
lxc-dave/kernel/acct.c | 2
lxc-dave/mm/swapfile.c | 4 -
lxc-dave/sound/sound_firmware.c | 2
9 files changed, 51 insertions(+), 51 deletions(-)
diff -puN drivers/usb/gadget/file_storage.c~kill-filp_open drivers/usb/gadget/file_storage.c
--- lxc/drivers/usb/gadget/file_storage.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/drivers/usb/gadget/file_storage.c 2007-09-28 11:03:27.000000000 -0700
@@ -3468,16 +3468,17 @@ static int open_backing_file(struct lun
struct inode *inode = NULL;
loff_t size;
loff_t num_sectors;
+ int mode = O_LARGEFILE;
/* R/W if we can, R/O if we must */
ro = curlun->ro;
if (!ro) {
- filp = filp_open(filename, O_RDWR | O_LARGEFILE, 0);
+ filp = open_namei(AT_FDCWD, filename, O_RDWR | mode, 0);
if (-EROFS == PTR_ERR(filp))
ro = 1;
}
if (ro)
- filp = filp_open(filename, O_RDONLY | O_LARGEFILE, 0);
+ filp = open_namei(AT_FDCWD, filename, O_RDONLY | mode, 0);
if (IS_ERR(filp)) {
LINFO(curlun, "unable to open backing file: %s\n", filename);
return PTR_ERR(filp);
diff -puN fs/exec.c~kill-filp_open fs/exec.c
--- lxc/fs/exec.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/exec.c 2007-09-28 11:03:27.000000000 -0700
@@ -1764,7 +1764,7 @@ int do_coredump(long signr, int exit_cod
goto fail_unlock;
}
} else
- file = filp_open(corename,
+ file = open_namei(AT_FDCWD, corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
if (IS_ERR(file))
diff -puN fs/namei.c~kill-filp_open fs/namei.c
--- lxc/fs/namei.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:27.000000000 -0700
@@ -1751,8 +1751,9 @@ static inline int sys_open_flags_to_name
* SMP-safe
*/
struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
- int mode, struct nameidata *nd)
+ int mode)
{
+ struct nameidata nd;
int acc_mode, error;
struct path path;
struct dentry *dir;
@@ -1775,7 +1776,7 @@ struct file *open_namei(int dfd, const c
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
- nd, flag);
+ &nd, flag);
if (error)
return ERR_PTR(error);
goto ok;
@@ -1784,7 +1785,7 @@ struct file *open_namei(int dfd, const c
/*
* Create - we need to know the parent.
*/
- error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+ error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,&nd,flag,mode);
if (error)
return ERR_PTR(error);
@@ -1794,14 +1795,14 @@ struct file *open_namei(int dfd, const c
* will not do.
*/
error = -EISDIR;
- if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+ if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
goto exit;
- dir = nd->dentry;
- nd->flags &= ~LOOKUP_PARENT;
+ dir = nd.dentry;
+ nd.flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->mnt;
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.mnt;
do_last:
error = PTR_ERR(path.dentry);
@@ -1810,17 +1811,17 @@ do_last:
goto exit;
}
- if (IS_ERR(nd->intent.open.file)) {
- error = PTR_ERR(nd->intent.open.file);
+ if (IS_ERR(nd.intent.open.file)) {
+ error = PTR_ERR(nd.intent.open.file);
goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = __open_namei_create(nd, &path, flag, mode);
+ error = __open_namei_create(&nd, &path, flag, mode);
if (error)
goto exit;
- return nameidata_to_filp(nd, sys_open_flag);
+ return nameidata_to_filp(&nd, sys_open_flag);
}
/*
@@ -1845,7 +1846,7 @@ do_last:
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
goto do_link;
- path_to_nameidata(&path, nd);
+ path_to_nameidata(&path, &nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
@@ -1854,26 +1855,26 @@ ok:
* This mnt_want_write() is potentially persistent,
* and balanced in __fput()
*/
- if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
- error = mnt_want_write(nd->mnt);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ error = mnt_want_write(nd.mnt);
if (error)
goto exit;
- error = may_open(nd, acc_mode, flag);
+ error = may_open(&nd, acc_mode, flag);
if (error) {
- if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
- mnt_drop_write(nd->mnt);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ mnt_drop_write(nd.mnt);
goto exit;
}
- return nameidata_to_filp(nd, sys_open_flag);
+ return nameidata_to_filp(&nd, sys_open_flag);
exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
- dput_path(&path, nd);
+ dput_path(&path, &nd);
exit:
- if (!IS_ERR(nd->intent.open.file))
- release_open_intent(nd);
- path_release(nd);
+ if (!IS_ERR(&nd.intent.open.file))
+ release_open_intent(&nd);
+ path_release(&nd);
return ERR_PTR(error);
do_link:
@@ -1887,42 +1888,42 @@ do_link:
* After that we have the parent and last component, i.e.
* we are in the same situation as after the first path_walk().
* Well, almost - if the last component is normal we get its copy
- * stored in nd->last.name and we will have to putname() it when we
+ * stored in nd.last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
- nd->flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, nd);
+ nd.flags |= LOOKUP_PARENT;
+ error = security_inode_follow_link(path.dentry, &nd);
if (error)
goto exit_dput;
- error = __do_follow_link(&path, nd);
+ error = __do_follow_link(&path, &nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
- release_open_intent(nd);
+ release_open_intent(&nd);
return ERR_PTR(error);
}
- nd->flags &= ~LOOKUP_PARENT;
- if (nd->last_type == LAST_BIND)
+ nd.flags &= ~LOOKUP_PARENT;
+ if (nd.last_type == LAST_BIND)
goto ok;
error = -EISDIR;
- if (nd->last_type != LAST_NORM)
+ if (nd.last_type != LAST_NORM)
goto exit;
- if (nd->last.name[nd->last.len]) {
- __putname(nd->last.name);
+ if (nd.last.name[nd.last.len]) {
+ __putname(nd.last.name);
goto exit;
}
error = -ELOOP;
if (count++==32) {
- __putname(nd->last.name);
+ __putname(nd.last.name);
goto exit;
}
- dir = nd->dentry;
+ dir = nd.dentry;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->mnt;
- __putname(nd->last.name);
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.mnt;
+ __putname(nd.last.name);
goto do_last;
}
diff -puN fs/open.c~kill-filp_open fs/open.c
--- lxc/fs/open.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:27.000000000 -0700
@@ -842,8 +842,7 @@ cleanup_file:
struct file *filp_open(const char *filename, int flags, int mode)
{
- struct nameidata nd;
- return open_namei(AT_FDCWD, filename, flags, mode, &nd);
+ return open_namei(AT_FDCWD, filename, flags, mode);
}
EXPORT_SYMBOL(filp_open);
@@ -1047,8 +1046,7 @@ long do_sys_open(int dfd, const char __u
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
- struct nameidata nd;
- struct file *f = open_namei(dfd, tmp, flags, mode, &nd);
+ struct file *f = open_namei(dfd, tmp, flags, mode);
if (IS_ERR(f)) {
put_unused_fd(fd);
diff -puN fs/reiserfs/journal.c~kill-filp_open fs/reiserfs/journal.c
--- lxc/fs/reiserfs/journal.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/reiserfs/journal.c 2007-09-28 11:03:27.000000000 -0700
@@ -2623,7 +2623,7 @@ static int journal_init_dev(struct super
return 0;
}
- journal->j_dev_file = filp_open(jdev_name, 0, 0);
+ journal->j_dev_file = open_namei(AT_FDCWD, jdev_name, 0, 0);
if (!IS_ERR(journal->j_dev_file)) {
struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
if (!S_ISBLK(jdev_inode->i_mode)) {
diff -puN include/linux/fs.h~kill-filp_open include/linux/fs.h
--- lxc/include/linux/fs.h~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:27.000000000 -0700
@@ -1721,7 +1721,7 @@ extern struct file *create_read_pipe(str
extern struct file *create_write_pipe(void);
extern void free_write_pipe(struct file *);
-extern struct file *open_namei(int dfd, const char *, int, int, struct nameidata *);
+extern struct file *open_namei(int dfd, const char *, int, int);
extern int may_open(struct nameidata *, int, int);
extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
diff -puN kernel/acct.c~kill-filp_open kernel/acct.c
--- lxc/kernel/acct.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/kernel/acct.c 2007-09-28 11:03:27.000000000 -0700
@@ -208,7 +208,7 @@ static int acct_on(char *name)
int error;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = open_namei(AT_FDCWD, name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
diff -puN mm/swapfile.c~kill-filp_open mm/swapfile.c
--- lxc/mm/swapfile.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/mm/swapfile.c 2007-09-28 11:03:27.000000000 -0700
@@ -1198,7 +1198,7 @@ asmlinkage long sys_swapoff(const char _
if (IS_ERR(pathname))
goto out;
- victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
+ victim = open_namei(AT_FDCWD, pathname, O_RDWR|O_LARGEFILE, 0);
putname(pathname);
err = PTR_ERR(victim);
if (IS_ERR(victim))
@@ -1477,7 +1477,7 @@ asmlinkage long sys_swapon(const char __
name = NULL;
goto bad_swap_2;
}
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ swap_file = open_namei(AT_FDCWD, name, O_RDWR|O_LARGEFILE, 0);
error = PTR_ERR(swap_file);
if (IS_ERR(swap_file)) {
swap_file = NULL;
diff -puN sound/sound_firmware.c~kill-filp_open sound/sound_firmware.c
--- lxc/sound/sound_firmware.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/sound/sound_firmware.c 2007-09-28 11:03:27.000000000 -0700
@@ -14,7 +14,7 @@ static int do_mod_firmware_load(const ch
char *dp;
loff_t pos;
- filp = filp_open(fn, 0, 0);
+ filp = open_namei(AT_FDCWD, fn, 0, 0);
if (IS_ERR(filp))
{
printk(KERN_INFO "Unable to load '%s'.\n", fn);
_
^ permalink raw reply [flat|nested] 10+ messages in thread* [RFC][PATCH 8/8] keep track of mnt_writer state of struct file
2007-09-28 18:13 [RFC][PATCH 1/8] get mount write in __dentry_open() Dave Hansen
` (5 preceding siblings ...)
2007-09-28 18:13 ` [RFC][PATCH 7/8] kill filp_open() Dave Hansen
@ 2007-09-28 18:13 ` Dave Hansen
6 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2007-09-28 18:13 UTC (permalink / raw)
To: linux-kernel; +Cc: hch, miklos, Dave Hansen
There have been a few oopses caused by 'struct file's with
NULL f_vfsmnts. There was also a set of potentially missed
mnt_want_write()s from dentry_open() calls.
This patch provides a very simple debugging framework to
catch these kinds of bugs. It will WARN_ON() them, but
should stop us from having any oopses or mnt_writer
count imbalances.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/fs/file_table.c | 21 +++++++++++++++++++--
lxc-dave/fs/namei.c | 15 +++++++++++++--
lxc-dave/include/linux/fs.h | 4 ++++
3 files changed, 36 insertions(+), 4 deletions(-)
diff -puN fs/file_table.c~keep-track-of-mnt_writer-state-of-struct-file fs/file_table.c
--- lxc/fs/file_table.c~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/fs/file_table.c 2007-09-28 11:03:50.000000000 -0700
@@ -42,6 +42,12 @@ static inline void file_free_rcu(struct
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
+ /*
+ * At this point, either both or neither of these bits
+ * should be set.
+ */
+ WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
+ WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -194,6 +200,7 @@ int init_file(struct file *file, struct
file->f_mode = mode;
file->f_op = fop;
if (mode & FMODE_WRITE) {
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
error = mnt_want_write(mnt);
WARN_ON(error);
}
@@ -236,8 +243,18 @@ void fastcall __fput(struct file *file)
fops_put(file->f_op);
if (file->f_mode & FMODE_WRITE) {
put_write_access(inode);
- if (!special_file(inode->i_mode))
- mnt_drop_write(mnt);
+ if (!special_file(inode->i_mode)) {
+ if (file->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) {
+ mnt_drop_write(mnt);
+ file->f_mnt_write_state |=
+ FILE_MNT_WRITE_RELEASED;
+ } else {
+ printk(KERN_WARNING "__fput() of writeable "
+ "file with no "
+ "mnt_want_write()\n");
+ WARN_ON(1);
+ }
+ }
}
put_pid(file->f_owner.pid);
file_kill(file);
diff -puN fs/namei.c~keep-track-of-mnt_writer-state-of-struct-file fs/namei.c
--- lxc/fs/namei.c~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:50.000000000 -0700
@@ -1753,6 +1753,7 @@ static inline int sys_open_flags_to_name
struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
int mode)
{
+ struct file *file;
struct nameidata nd;
int acc_mode, error;
struct path path;
@@ -1821,7 +1822,14 @@ do_last:
error = __open_namei_create(&nd, &path, flag, mode);
if (error)
goto exit;
- return nameidata_to_filp(&nd, sys_open_flag);
+ file = nameidata_to_filp(&nd, sys_open_flag);
+ /*
+ * The mnt_want_write happened in
+ * __open_namei_create(), but it
+ * happened unconditionally, so
+ * this is safe to assume.
+ */
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
}
/*
@@ -1865,7 +1873,10 @@ ok:
mnt_drop_write(nd.mnt);
goto exit;
}
- return nameidata_to_filp(&nd, sys_open_flag);
+ file = nameidata_to_filp(&nd, sys_open_flag);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
+ return file;
exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
diff -puN include/linux/fs.h~keep-track-of-mnt_writer-state-of-struct-file include/linux/fs.h
--- lxc/include/linux/fs.h~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:50.000000000 -0700
@@ -779,6 +779,9 @@ static inline int ra_has_index(struct fi
index < ra->start + ra->size);
}
+#define FILE_MNT_WRITE_TAKEN 1
+#define FILE_MNT_WRITE_RELEASED 2
+
struct file {
/*
* fu_list becomes invalid after file_free is called and queued via
@@ -813,6 +816,7 @@ struct file {
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
+ unsigned long f_mnt_write_state;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
_
^ permalink raw reply [flat|nested] 10+ messages in thread