Re: [PATCH -V5 3/8] vfs: Add open by file handle support

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Neil Brown <neilb@suse.de>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: hch@infradead.org, viro@zeniv.linux.org.uk, adilger@sun.com,
	corbet@lwn.net, serue@us.ibm.com, linux-fsdevel@vger.kernel.org,
	sfrench@us.ibm.com
Subject: Re: [PATCH -V5 3/8] vfs: Add open by file handle support
Date: Tue, 27 Apr 2010 11:05:45 +1000	[thread overview]
Message-ID: <20100427110545.2dd50b36@notabene.brown> (raw)
In-Reply-To: <1272303246-25570-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, 26 Apr 2010 23:04:01 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Acked-by: Serge Hallyn <serue@us.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  fs/filesystems.c              |   32 +++++++++-
>  fs/namei.c                    |   24 -------
>  fs/namespace.c                |   38 +++++++++++
>  fs/open.c                     |  148 +++++++++++++++++++++++++++++++++++++++++
>  fs/pnode.c                    |    2 +-
>  include/linux/mnt_namespace.h |    2 +
>  include/linux/namei.h         |   24 +++++++
>  7 files changed, 244 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/filesystems.c b/fs/filesystems.c
> index 68ba492..743d36e 100644
> --- a/fs/filesystems.c
> +++ b/fs/filesystems.c
> @@ -281,5 +281,35 @@ struct file_system_type *get_fs_type(const char *name)
>  	}
>  	return fs;
>  }
> -
>  EXPORT_SYMBOL(get_fs_type);
> +
> +struct super_block *fs_get_sb(struct uuid *fsid)
> +{
> +	struct uuid *this_fsid;
> +	struct file_system_type *fs_type;
> +	struct super_block *sb, *found_sb = NULL;
> +
> +	read_lock(&file_systems_lock);
> +	fs_type = file_systems;
> +	while (fs_type) {

You've open-coded a for-loop here...
Why not:
        for (fs_type = file_systems ; fs_type ; fs_type = fs_type->next) {
??


> +		spin_lock(&sb_lock);
> +		list_for_each_entry(sb, &fs_type->fs_supers, s_instances) {
> +			if (!sb->s_op->get_fsid)
> +				continue;
> +			this_fsid = sb->s_op->get_fsid(sb);
> +			if (!memcmp(fsid->uuid, this_fsid->uuid,
> +					sizeof(this_fsid->uuid))) {
> +				/* found the matching super_block */
> +				atomic_inc(&sb->s_active);
> +				found_sb = sb;
> +				spin_unlock(&sb_lock);
> +				goto out;
> +			}
> +		}
> +		spin_unlock(&sb_lock);
> +		fs_type = fs_type->next;
> +	}
> +out:
> +	read_unlock(&file_systems_lock);
> +	return found_sb;
> +}
> diff --git a/fs/namei.c b/fs/namei.c
> index a7dce91..a18711e 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1521,30 +1521,6 @@ out_unlock:
>  	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
>  }
>  
> -/*
> - * Note that while the flag value (low two bits) for sys_open means:
> - *	00 - read-only
> - *	01 - write-only
> - *	10 - read-write
> - *	11 - special
> - * it is changed into
> - *	00 - no permissions needed
> - *	01 - read-permission
> - *	10 - write-permission
> - *	11 - read-write
> - * for the internal routines (ie open_namei()/follow_link() etc)
> - * This is more logical, and also allows the 00 "no perm needed"
> - * to be used for symlinks (where the permissions are checked
> - * later).
> - *
> -*/
> -static inline int open_to_namei_flags(int flag)
> -{
> -	if ((flag+1) & O_ACCMODE)
> -		flag++;
> -	return flag;
> -}
> -
>  static int open_will_truncate(int flag, struct inode *inode)
>  {
>  	/*
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8174c8a..6168526 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2364,3 +2364,41 @@ void put_mnt_ns(struct mnt_namespace *ns)
>  	kfree(ns);
>  }
>  EXPORT_SYMBOL(put_mnt_ns);
> +
> +/*
> + * Get any vfsmount mapping the superblock in the
> + * task namespace
> + */
> +struct vfsmount *fs_get_vfsmount(struct task_struct *task,
> +				struct super_block *sb)
> +{
> +	struct nsproxy *nsp;
> +	struct list_head *mount_list;
> +	struct mnt_namespace *ns = NULL;
> +	struct vfsmount *mnt, *sb_mnt = NULL;
> +
> +	rcu_read_lock();
> +	nsp = task_nsproxy(task);
> +	if (nsp) {
> +		ns = nsp->mnt_ns;
> +		if (ns)
> +			get_mnt_ns(ns);
> +	}
> +	rcu_read_unlock();
> +	if (!ns)
> +		return NULL;
> +	down_read(&namespace_sem);
> +	list_for_each(mount_list, &ns->list) {
> +		mnt = list_entry(mount_list, struct vfsmount, mnt_list);
> +		if (mnt->mnt_sb == sb) {
> +			/* found the matching super block */
> +			sb_mnt = mnt;
> +			mntget(sb_mnt);
> +			break;
> +		}
> +	}
> +	up_read(&namespace_sem);
> +
> +	put_mnt_ns(ns);
> +	return sb_mnt;
> +}

If a task has the same filesystem mounted several times in it's namespace
(via "mount --bind") you just return any arbitrary one (the first).

Suppose that the filesystem does appear twice, and that in one appearance a
particular directory is mounted on, while in another appearance that
directory is not mounted on.  Suppose that the mounted-on appearance is
listed first in ns->list

Now we use a series of "name_to_handle" / "open_by_handle" calls to descend
through the second appearance to something beneath that directory which (in
the first appearance) is mounted-on.
This will not work as you will actually be descending the first appearance of
the filesystem.
I really think you should leave the choice of filesystem/mountpoint to
user-space.


> diff --git a/fs/open.c b/fs/open.c
> index e9aae5c..b87fa32 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1306,3 +1306,151 @@ SYSCALL_DEFINE2(lname_to_handle, const char __user *, name,
>  	asmlinkage_protect(2, ret, name, handle);
>  	return ret;
>  }
> +
> +static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
> +{
> +	return 1;
> +}
> +
> +static struct dentry *handle_to_dentry(struct vfsmount *mnt,
> +				struct file_handle *handle)
> +{
> +	int handle_size;
> +	struct dentry *dentry;
> +
> +	/* change the handle size to multiple of sizeof(u32) */
> +	handle_size = handle->handle_size >> 2;
> +	dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle,
> +					handle_size, handle->handle_type,
> +					vfs_dentry_acceptable, NULL);
> +	return dentry;
> +}
> +
> +long do_sys_open_by_handle(struct file_handle __user *ufh, int flags)
> +{
> +	int fd;
> +	int retval = 0;
> +	int d_flags  = flags;
> +	struct file *filp;
> +	struct vfsmount *mnt;
> +	struct inode *inode;
> +	struct dentry *dentry;
> +	struct super_block *sb;
> +	struct file_handle f_handle;
> +	struct file_handle *handle = NULL;
> +
> +	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
> +		return  -EFAULT;
> +
> +	if ((f_handle.handle_size > MAX_HANDLE_SZ) ||
> +		(f_handle.handle_size <= 0))
> +		return -EINVAL;
> +
> +	if (!capable(CAP_DAC_OVERRIDE))
> +		return -EPERM;
> +
> +	sb = fs_get_sb(&f_handle.fsid);
> +	if (!sb)
> +		return -ESTALE;
> +	/*
> +	 * Find the vfsmount for this superblock in the
> +	 * current namespace
> +	 */
> +	mnt = fs_get_vfsmount(current, sb);
> +	if (!mnt) {
> +		retval = -ESTALE;
> +		goto out_sb;
> +	}
> +
> +	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size,
> +			GFP_KERNEL);
> +	if (!handle) {
> +		retval =  -ENOMEM;
> +		goto out_mnt;
> +	}
> +	/* copy the full handle */
> +	if (copy_from_user(handle, ufh,
> +				sizeof(struct file_handle) +
> +				f_handle.handle_size)) {
> +		retval = -EFAULT;
> +		goto out_mnt;
> +	}
> +
> +	dentry = handle_to_dentry(mnt, handle);
> +	if (IS_ERR(dentry)) {
> +		retval = PTR_ERR(dentry);
> +		goto out_mnt;
> +	}
> +
> +	inode = dentry->d_inode;
> +
> +	flags  = open_to_namei_flags(flags);
> +	/* O_TRUNC implies we need access checks for write permissions */
> +	if (flags & O_TRUNC)
> +		flags |= MAY_WRITE;
> +
> +	if ((!(flags & O_APPEND) || (flags & O_TRUNC)) &&
> +		(flags & FMODE_WRITE) && IS_APPEND(inode)) {
> +		retval = -EPERM;
> +		goto out_err;
> +	}
> +
> +	if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
> +		retval = -EACCES;
> +		goto out_err;
> +	}
> +
> +	/* Can't write directories. */
> +	if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) {
> +		retval = -EISDIR;
> +		goto out_err;
> +	}
> +
> +	fd = get_unused_fd();

You want to use alloc_fd() (or get_unused_fd_flags) rather than get_unused_fd
so that the flags can be passed though and, in particular, so O_CLOEXEC will
be honoured.


> +	if (fd < 0) {
> +		retval = fd;
> +		goto out_err;
> +	}
> +
> +	filp = dentry_open(dentry, mntget(mnt),
> +			d_flags, current_cred());

dentry_open consumed the reference to dentry, so ...

> +	if (IS_ERR(filp)) {
> +		put_unused_fd(fd);
> +		retval =  PTR_ERR(filp);
> +		goto out_err;

The dput at 'out_err' will put it one time too many.
I think the best fix would be to pass dget(dentry) to dentry_open, then ....

> +	}
> +
> +	if (inode->i_mode & S_IFREG) {
> +		filp->f_flags |= O_NOATIME;
> +		filp->f_mode |= FMODE_NOCMTIME;
> +	}
> +	fsnotify_open(filp->f_path.dentry);
> +	fd_install(fd, filp);
> +	retval = fd;
> +	goto out_mnt;

... this goto (Which is rather ugly to me) would not be needed.

NeilBrown


> +
> +out_err:
> +	dput(dentry);
> +out_mnt:
> +	kfree(handle);
> +	mntput(mnt);
> +out_sb:
> +	deactivate_super(sb);
> +
> +	return retval;
> +}
> +
> +SYSCALL_DEFINE2(open_by_handle, struct file_handle __user *, handle,
> +		int, flags)
> +{
> +	long ret;
> +
> +	if (force_o_largefile())
> +		flags |= O_LARGEFILE;
> +
> +	ret = do_sys_open_by_handle(handle, flags);
> +
> +	/* avoid REGPARM breakage on x86: */
> +	asmlinkage_protect(2, ret, handle, flags);
> +	return ret;
> +}
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5cc564a..9f6d12d 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -6,9 +6,9 @@
>   *	Author : Ram Pai (linuxram@us.ibm.com)
>   *
>   */
> +#include <linux/fs.h>
>  #include <linux/mnt_namespace.h>
>  #include <linux/mount.h>
> -#include <linux/fs.h>
>  #include "internal.h"
>  #include "pnode.h"
>  
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 0b89efc..d363ecc 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -36,6 +36,8 @@ extern const struct seq_operations mounts_op;
>  extern const struct seq_operations mountinfo_op;
>  extern const struct seq_operations mountstats_op;
>  extern int mnt_had_events(struct proc_mounts *);
> +extern struct vfsmount *fs_get_vfsmount(struct task_struct *task,
> +					struct super_block *sb);
>  
>  #endif
>  #endif
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index 05b441d..a853aa0 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -4,6 +4,7 @@
>  #include <linux/dcache.h>
>  #include <linux/linkage.h>
>  #include <linux/path.h>
> +#include <asm-generic/fcntl.h>
>  
>  struct vfsmount;
>  
> @@ -96,4 +97,27 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
>  	((char *) name)[min(len, maxlen)] = '\0';
>  }
>  
> +/*
> + * Note that while the flag value (low two bits) for sys_open means:
> + *	00 - read-only
> + *	01 - write-only
> + *	10 - read-write
> + *	11 - special
> + * it is changed into
> + *	00 - no permissions needed
> + *	01 - read-permission
> + *	10 - write-permission
> + *	11 - read-write
> + * for the internal routines (ie open_namei()/follow_link() etc)
> + * This is more logical, and also allows the 00 "no perm needed"
> + * to be used for symlinks (where the permissions are checked
> + * later).
> + *
> +*/
> +static inline int open_to_namei_flags(int flag)
> +{
> +	if ((flag+1) & O_ACCMODE)
> +		flag++;
> +	return flag;
> +}
>  #endif /* _LINUX_NAMEI_H */

next prev parent reply	other threads:[~2010-04-27  1:05 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-26 17:33 [PATCH -V5 0/8] Generic name to handle and open by handle syscalls Aneesh Kumar K.V
2010-04-26 17:33 ` [PATCH -V5 1/8] exportfs: Return the minimum required handle size Aneesh Kumar K.V
2010-04-26 17:34 ` [PATCH -V5 2/8] vfs: Add name to file handle conversion support Aneesh Kumar K.V
2010-04-27  1:19   ` Neil Brown
2010-04-27  6:08     ` Aneesh Kumar K. V
2010-04-26 17:34 ` [PATCH -V5 3/8] vfs: Add open by file handle support Aneesh Kumar K.V
2010-04-27  1:05   ` Neil Brown [this message]
2010-04-27  6:10     ` Aneesh Kumar K. V
2010-04-26 17:34 ` [PATCH -V5 4/8] vfs: Add freadlink syscall Aneesh Kumar K.V
2010-04-26 17:34 ` [PATCH -V5 5/8] ext4: Add get_fsid callback Aneesh Kumar K.V
2010-04-26 17:34 ` [PATCH -V5 6/8] x86: Add new syscalls for x86_32 Aneesh Kumar K.V
2010-04-26 17:34 ` [PATCH -V5 7/8] x86: Add new syscalls for x86_64 Aneesh Kumar K.V
2010-04-26 17:34 ` [PATCH -V5 8/8] ext3: Add get_fsid callback Aneesh Kumar K.V

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100427110545.2dd50b36@notabene.brown \
    --to=neilb@suse.de \
    --cc=adilger@sun.com \
    --cc=aneesh.kumar@linux.vnet.ibm.com \
    --cc=corbet@lwn.net \
    --cc=hch@infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=serue@us.ibm.com \
    --cc=sfrench@us.ibm.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).