* [RFC PATCH v2 1/2] fs: add AT_REPLACE flag for linkat() which replaces the target
2017-03-21 14:51 [RFC PATCH v2 0/2] fs: add AT_REPLACE flag for linkat() Omar Sandoval
@ 2017-03-21 14:51 ` Omar Sandoval
2017-03-21 16:30 ` Linus Torvalds
2017-03-21 14:51 ` [RFC PATCH v2 2/2] Btrfs: add support for linkat() AT_REPLACE Omar Sandoval
1 sibling, 1 reply; 8+ messages in thread
From: Omar Sandoval @ 2017-03-21 14:51 UTC (permalink / raw)
To: Al Viro, linux-fsdevel
Cc: Linus Torvalds, linux-api, kernel-team, Xi Wang, Omar Sandoval
From: Omar Sandoval <osandov@fb.com>
One of the most common uses of temporary files is the classic atomic
replacement pattern, i.e.,
- write temporary file
- fsync temporary file
- rename temporary file over real file
- fsync parent directory
Now, we have O_TMPFILE, which gives us a much better way to create
temporary files, but it's not possible to use it for this pattern.
This patch introduces an AT_REPLACE flag which allows linkat() to
replace the target file. Now, the temporary file in the pattern above
can be a proper O_TMPFILE. Even without O_TMPFILE, this is a new
primitive which might be useful in other contexts.
Cc: Xi Wang <xi@cs.washington.edu>
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
fs/ecryptfs/inode.c | 2 +-
fs/namei.c | 181 ++++++++++++++++++++++++++++++++++++---------
fs/nfsd/vfs.c | 2 +-
fs/overlayfs/overlayfs.h | 2 +-
include/linux/fs.h | 3 +-
include/uapi/linux/fcntl.h | 1 +
6 files changed, 150 insertions(+), 41 deletions(-)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index efc2db42d175..784c846b781e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -442,7 +442,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
dget(lower_new_dentry);
lower_dir_dentry = lock_parent(lower_new_dentry);
rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
- lower_new_dentry, NULL);
+ lower_new_dentry, NULL, 0);
if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
diff --git a/fs/namei.c b/fs/namei.c
index d41fab78798b..89e5049fe19b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4119,6 +4119,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* @dir: new parent
* @new_dentry: where to create the new link
* @delegated_inode: returns inode needing a delegation break
+ * @flags: link flags
*
* The caller must hold dir->i_mutex
*
@@ -4132,16 +4133,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, struct inode **delegated_inode,
+ unsigned int flags)
{
struct inode *inode = old_dentry->d_inode;
+ struct inode *target = new_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
int error;
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry);
+ if (target) {
+ if (flags & AT_REPLACE)
+ error = may_delete(dir, new_dentry, d_is_dir(old_dentry));
+ else
+ error = -EEXIST;
+ } else {
+ error = may_create(dir, new_dentry);
+ }
if (error)
return error;
@@ -4160,8 +4171,10 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
*/
if (HAS_UNMAPPED_ID(inode))
return -EPERM;
- if (!dir->i_op->link)
+ if (!dir->i_op->link && !dir->i_op->link2)
return -EPERM;
+ if (flags && !dir->i_op->link2)
+ return -EINVAL;
if (S_ISDIR(inode->i_mode))
return -EPERM;
@@ -4169,26 +4182,58 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (error)
return error;
- inode_lock(inode);
+ dget(new_dentry);
+ lock_two_nondirectories(inode, target);
+
+ if (is_local_mountpoint(new_dentry)) {
+ error = -EBUSY;
+ goto out;
+ }
+
/* Make sure we don't allow creating hardlink to an unlinked file */
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) {
error = -ENOENT;
- else if (max_links && inode->i_nlink >= max_links)
+ goto out;
+ }
+ if (max_links && inode->i_nlink >= max_links) {
error = -EMLINK;
- else {
- error = try_break_deleg(inode, delegated_inode);
- if (!error)
- error = dir->i_op->link(old_dentry, dir, new_dentry);
+ goto out;
+ }
+
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ goto out;
+ if (target) {
+ error = try_break_deleg(target, delegated_inode);
+ if (error)
+ goto out;
+ }
+
+ if (dir->i_op->link)
+ error = dir->i_op->link(old_dentry, dir, new_dentry);
+ else
+ error = dir->i_op->link2(old_dentry, dir, new_dentry, flags);
+ if (error)
+ goto out;
+
+ if (target) {
+ dont_mount(new_dentry);
+ detach_mounts(new_dentry);
}
- if (!error && (inode->i_state & I_LINKABLE)) {
+ if (inode->i_state & I_LINKABLE) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_LINKABLE;
spin_unlock(&inode->i_lock);
}
- inode_unlock(inode);
- if (!error)
+out:
+ unlock_two_nondirectories(inode, target);
+ dput(new_dentry);
+ if (!error) {
+ if (target)
+ fsnotify_link_count(target);
fsnotify_link(dir, inode, new_dentry);
+ }
return error;
}
EXPORT_SYMBOL(vfs_link);
@@ -4207,11 +4252,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
{
struct dentry *new_dentry;
struct path old_path, new_path;
+ struct qstr new_last;
+ int new_type;
struct inode *delegated_inode = NULL;
- int how = 0;
+ struct filename *to;
+ unsigned int how = 0, target_flags;
+ bool should_retry = false;
int error;
- if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+ if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_REPLACE)) != 0)
return -EINVAL;
/*
* To use null names we require CAP_DAC_READ_SEARCH
@@ -4226,44 +4275,102 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
+
+ if (flags & AT_REPLACE)
+ target_flags = LOOKUP_RENAME_TARGET;
+ else
+ target_flags = LOOKUP_CREATE | LOOKUP_EXCL;
retry:
error = user_path_at(olddfd, oldname, how, &old_path);
if (error)
return error;
- new_dentry = user_path_create(newdfd, newname, &new_path,
- (how & LOOKUP_REVAL));
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto out;
+ to = filename_parentat(newdfd, getname(newname), how & LOOKUP_REVAL,
+ &new_path, &new_last, &new_type);
+ if (IS_ERR(to)) {
+ error = PTR_ERR(to);
+ goto exit1;
+ }
+
+ if (old_path.mnt != new_path.mnt) {
+ error = -EXDEV;
+ goto exit2;
+ }
+
+ if (new_type != LAST_NORM) {
+ if (flags & AT_REPLACE)
+ error = -EBUSY;
+ else
+ error = -EEXIST;
+ goto exit2;
+ }
+
+ error = mnt_want_write(old_path.mnt);
+ if (error)
+ goto exit2;
+
+retry_deleg:
+ inode_lock_nested(new_path.dentry->d_inode, I_MUTEX_PARENT);
+
+ new_dentry = __lookup_hash(&new_last, new_path.dentry,
+ (how & LOOKUP_REVAL) | target_flags);
+ if (IS_ERR(new_dentry)) {
+ error = PTR_ERR(new_dentry);
+ goto exit3;
+ }
+ if (!(flags & AT_REPLACE) && d_is_positive(new_dentry)) {
+ error = -EEXIST;
+ goto exit4;
+ }
+ if (new_last.name[new_last.len]) {
+ /* trailing slash on negative dentry gives -ENOENT */
+ if (d_is_negative(new_dentry)) {
+ error = -ENOENT;
+ goto exit4;
+ }
+
+ /*
+ * unless the source is a directory, trailing slash gives
+ * -ENOTDIR (this can only happen in the AT_REPLACE case, so we
+ * make this consistent with sys_renameat2() even though a
+ * source directory will fail later with -EPERM)
+ */
+ if (!d_is_dir(old_path.dentry)) {
+ error = -ENOTDIR;
+ goto exit4;
+ }
+ }
- error = -EXDEV;
- if (old_path.mnt != new_path.mnt)
- goto out_dput;
error = may_linkat(&old_path);
if (unlikely(error))
- goto out_dput;
+ goto exit4;
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
- goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
-out_dput:
- done_path_create(&new_path, new_dentry);
+ goto exit4;
+ error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry,
+ &delegated_inode, flags & AT_REPLACE);
+exit4:
+ dput(new_dentry);
+exit3:
+ inode_unlock(new_path.dentry->d_inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
- if (!error) {
- path_put(&old_path);
- goto retry;
- }
+ if (!error)
+ goto retry_deleg;
}
- if (retry_estale(error, how)) {
- path_put(&old_path);
+ mnt_drop_write(old_path.mnt);
+exit2:
+ if (retry_estale(error, how))
+ should_retry = true;
+ path_put(&new_path);
+ putname(to);
+exit1:
+ path_put(&old_path);
+ if (should_retry) {
+ should_retry = false;
how |= LOOKUP_REVAL;
goto retry;
}
-out:
- path_put(&old_path);
-
return error;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..a81f616fe146 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1589,7 +1589,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = nfserr_noent;
if (d_really_is_negative(dold))
goto out_dput;
- host_err = vfs_link(dold, dirp, dnew, NULL);
+ host_err = vfs_link(dold, dirp, dnew, NULL, 0);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 741dc0b6931f..b6bd64d2a097 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -40,7 +40,7 @@ static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
- int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+ int err = vfs_link(old_dentry, dir, new_dentry, NULL, 0);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7bb45e8..a8941949d272 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1561,7 +1561,7 @@ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
-extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
@@ -1701,6 +1701,7 @@ struct inode_operations {
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
+ int (*link2) (struct dentry *,struct inode *,struct dentry *,unsigned int);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 813afd6eee71..37afa527c742 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -62,6 +62,7 @@
#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
+#define AT_REPLACE 0x2000 /* Replace new path */
#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
--
2.12.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [RFC PATCH v2 2/2] Btrfs: add support for linkat() AT_REPLACE
2017-03-21 14:51 [RFC PATCH v2 0/2] fs: add AT_REPLACE flag for linkat() Omar Sandoval
2017-03-21 14:51 ` [RFC PATCH v2 1/2] fs: add AT_REPLACE flag for linkat() which replaces the target Omar Sandoval
@ 2017-03-21 14:51 ` Omar Sandoval
1 sibling, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2017-03-21 14:51 UTC (permalink / raw)
To: Al Viro, linux-fsdevel
Cc: Linus Torvalds, linux-api, kernel-team, Xi Wang, Omar Sandoval
From: Omar Sandoval <osandov@fb.com>
The implementation is fairly straightforward and looks a lot like
btrfs_rename().
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
fs/btrfs/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 61 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c40060cc481f..188e8e5fca50 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6468,16 +6468,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
+ struct dentry *dentry, unsigned int flags)
{
struct btrfs_trans_handle *trans = NULL;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 index;
int err;
int drop_inode = 0;
+ if (flags & ~AT_REPLACE)
+ return -EINVAL;
+
/* do not allow sys_link's with other subvols of the same device */
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EXDEV;
@@ -6485,16 +6490,47 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
+ /* check for collisions, even if the name isn't there */
+ err = btrfs_check_dir_item_collision(root, dir->i_ino,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (err) {
+ if (err == -EEXIST) {
+ if (WARN_ON(!new_inode))
+ return err;
+ } else {
+ return err;
+ }
+ }
+
+ /*
+ * we're using link to replace one file with another. Start IO on it now
+ * so we don't add too much work to the end of the transaction
+ */
+ if (new_inode && S_ISREG(inode->i_mode) && new_inode->i_size)
+ filemap_flush(inode->i_mapping);
+
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (err)
goto fail;
/*
+ * For the source:
* 2 items for inode and inode ref
* 2 items for dir items
* 1 item for parent inode
+ *
+ * For the target:
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
+ trans_num_items = 5;
+ if (new_inode)
+ trans_num_items += 5;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
@@ -6506,6 +6542,22 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
+
+ if (new_inode) {
+ inode_inc_iversion(new_inode);
+ new_inode->i_ctime = current_time(new_inode);
+ err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ BTRFS_I(new_inode),
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (!err && new_inode->i_nlink == 0)
+ err = btrfs_orphan_add(trans, BTRFS_I(new_inode));
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ goto fail;
+ }
+ }
+
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
@@ -6528,7 +6580,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
- d_instantiate(dentry, inode);
+ if (new_inode) {
+ d_drop(dentry);
+ iput(inode);
+ } else {
+ d_instantiate(dentry, inode);
+ }
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
@@ -10519,7 +10576,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.lookup = btrfs_lookup,
.create = btrfs_create,
.unlink = btrfs_unlink,
- .link = btrfs_link,
+ .link2 = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
.rename = btrfs_rename2,
--
2.12.0
^ permalink raw reply related [flat|nested] 8+ messages in thread