From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: darrick.wong@oracle.com
Cc: linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-api@vger.kernel.org
Subject: [PATCH 03/18] vfs: introduce new file extent swap ioctl
Date: Tue, 28 Apr 2020 19:44:33 -0700 [thread overview]
Message-ID: <158812827320.168506.17255602633619684843.stgit@magnolia> (raw)
In-Reply-To: <158812825316.168506.932540609191384366.stgit@magnolia>
From: Darrick J. Wong <darrick.wong@oracle.com>
Introduce a new ioctl to handle swapping extents between two files.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/ioctl.c | 32 ++++++++
fs/read_write.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/libxfs/xfs_fs.h | 1
include/linux/fs.h | 15 ++++
include/uapi/linux/fs.h | 55 ++++++++++++++
mm/filemap.c | 77 +++++++++++++++++++
6 files changed, 367 insertions(+), 1 deletion(-)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 282d45be6f45..f564e6f2fad5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -268,6 +268,35 @@ static long ioctl_file_clone_range(struct file *file,
args.src_length, args.dest_offset);
}
+/*
+ * Handle FISWAPRANGE: swap a byte range between the file named by
+ * args.file1_fd and @file2, the file that the ioctl was issued against.
+ *
+ * The request is copied in from @argp.  Both files must be on the same mount
+ * (-EXDEV otherwise).  On success the request structure is copied back out
+ * to @argp.  Returns 0 or a negative errno.
+ */
+static long ioctl_file_swap_range(struct file *file2,
+				  struct file_swap_range __user *argp)
+{
+	struct file_swap_range args;
+	struct fd file1;
+	int ret;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	/*
+	 * file1_fd is a 64-bit field, but fdget() takes a 32-bit descriptor
+	 * number.  Reject values that do not fit so that a truncated value
+	 * can never alias some other open descriptor.  Negative values that
+	 * do fit are left for fdget() to reject.
+	 */
+	if (args.file1_fd != (int)args.file1_fd)
+		return -EBADF;
+
+	file1 = fdget(args.file1_fd);
+	if (!file1.file)
+		return -EBADF;
+
+	ret = -EXDEV;
+	if (file1.file->f_path.mnt != file2->f_path.mnt)
+		goto fdput;
+
+	ret = vfs_swap_file_range(file1.file, file2, &args);
+	if (ret)
+		goto fdput;
+
+	/*
+	 * Hand the request back to userspace; the swap code is passed a
+	 * non-const pointer and may have updated it (for example,
+	 * generic_swap_file_range_prep() rewrites the length when
+	 * FILE_SWAP_RANGE_TO_EOF is set).
+	 */
+	if (copy_to_user(argp, &args, sizeof(args)))
+		ret = -EFAULT;
+fdput:
+	fdput(file1);
+	return ret;
+}
+
#ifdef CONFIG_BLOCK
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -730,6 +759,9 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
case FIDEDUPERANGE:
return ioctl_file_dedupe_range(filp, argp);
+ case FISWAPRANGE:
+ return ioctl_file_swap_range(filp, argp);
+
case FIONREAD:
if (!S_ISREG(inode->i_mode))
return vfs_ioctl(filp, cmd, arg);
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..2b5116f129de 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2081,6 +2081,92 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
+/*
+ * Check that the two inodes are eligible for range swapping, the ranges make
+ * sense, and then flush all dirty data.  Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ *
+ * If FILE_SWAP_RANGE_TO_EOF is set, fsr->length is overwritten with the
+ * larger of the two distances from each file's offset to that file's EOF.
+ *
+ * Returns 0 if the swap may proceed (or if the length is zero and there is
+ * nothing to do), or a negative errno.
+ */
+int generic_swap_file_range_prep(struct file *file1, struct file *file2,
+				 struct file_swap_range *fsr)
+{
+	struct inode *inode1 = file_inode(file1);
+	struct inode *inode2 = file_inode(file2);
+	u64 blkmask = i_blocksize(inode1) - 1;
+	bool same_inode = (inode1 == inode2);
+	int ret;
+
+	/*
+	 * Don't touch certain kinds of inodes.  A swap rewrites the contents
+	 * of both files, so neither of them may be immutable.
+	 */
+	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+		return -EPERM;
+
+	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+		return -ETXTBSY;
+
+	/* Don't swap dirs, pipes, sockets... */
+	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+		return -EINVAL;
+
+	/* Ranges cannot start after EOF. */
+	if (fsr->file1_offset > i_size_read(inode1) ||
+	    fsr->file2_offset > i_size_read(inode2))
+		return -EINVAL;
+
+	/*
+	 * If the caller said to swap to EOF, we set the length of the request
+	 * large enough to cover everything to the end of both files.
+	 */
+	if (fsr->flags & FILE_SWAP_RANGE_TO_EOF)
+		fsr->length = max_t(int64_t,
+				    i_size_read(inode1) - fsr->file1_offset,
+				    i_size_read(inode2) - fsr->file2_offset);
+
+	/* Zero length swapext exits immediately. */
+	if (fsr->length == 0)
+		return 0;
+
+	/* Check that we don't violate system file offset limits. */
+	ret = generic_swap_file_range_checks(file1, file2, fsr);
+	if (ret)
+		return ret;
+
+	/*
+	 * Ensure that we don't swap a partial EOF block into the middle of
+	 * another file.  An unaligned length is only acceptable when the
+	 * range runs to (or past) EOF in both files; otherwise rounding down
+	 * changes new_length and the request is rejected.
+	 */
+	if (fsr->length & blkmask) {
+		loff_t new_length = fsr->length;
+
+		if (fsr->file2_offset + new_length < i_size_read(inode2))
+			new_length &= ~blkmask;
+
+		if (fsr->file1_offset + new_length < i_size_read(inode1))
+			new_length &= ~blkmask;
+
+		if (new_length != fsr->length)
+			return -EINVAL;
+	}
+
+	/* Wait for the completion of any pending IOs on both files */
+	inode_dio_wait(inode1);
+	if (!same_inode)
+		inode_dio_wait(inode2);
+
+	/* Write back and wait on the dirty pagecache of each range. */
+	ret = filemap_write_and_wait_range(inode1->i_mapping, fsr->file1_offset,
+					   fsr->file1_offset + fsr->length - 1);
+	if (ret)
+		return ret;
+
+	ret = filemap_write_and_wait_range(inode2->i_mapping, fsr->file2_offset,
+					   fsr->file2_offset + fsr->length - 1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(generic_swap_file_range_prep);
+
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
@@ -2278,3 +2364,105 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range);
+
+/*
+ * Compare both files' current metadata against the expectations recorded in
+ * the range swap request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ *
+ * Returns -EDOM if FILE_SWAP_RANGE_FULL_FILES is set and the request does not
+ * start at offset zero in both files with a length equal to both file sizes;
+ * -EBUSY if FILE_SWAP_RANGE_FILE2_FRESH is set and file2's inode number,
+ * ctime or mtime differ from the values in the request; 0 otherwise.
+ */
+int generic_swap_file_range_check_fresh(struct inode *inode1,
+					struct inode *inode2,
+					const struct file_swap_range *fsr)
+{
+	if (fsr->flags & FILE_SWAP_RANGE_FULL_FILES) {
+		/* Both ranges must begin at the start of their file... */
+		if (fsr->file1_offset != 0 || fsr->file2_offset != 0)
+			return -EDOM;
+
+		/* ...and the length must match the size of each file. */
+		if (fsr->length != i_size_read(inode1) ||
+		    fsr->length != i_size_read(inode2))
+			return -EDOM;
+	}
+
+	if (fsr->flags & FILE_SWAP_RANGE_FILE2_FRESH) {
+		/* Any difference means file2 is not the file we sampled. */
+		if (fsr->file2_ino != inode2->i_ino)
+			return -EBUSY;
+
+		if (fsr->file2_ctime != inode2->i_ctime.tv_sec ||
+		    fsr->file2_ctime_nsec != inode2->i_ctime.tv_nsec)
+			return -EBUSY;
+
+		if (fsr->file2_mtime != inode2->i_mtime.tv_sec ||
+		    fsr->file2_mtime_nsec != inode2->i_mtime.tv_nsec)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(generic_swap_file_range_check_fresh);
+
+/*
+ * Call remap_verify_area() for one side of a swap request, passing true as
+ * its final argument.  For FILE_SWAP_RANGE_TO_EOF requests the length handed
+ * down is capped at the distance from @pos to the current end of @file.
+ */
+static inline int swap_range_verify_area(struct file *file, loff_t pos,
+					 struct file_swap_range *fsr)
+{
+	int64_t count = fsr->length;
+
+	if (fsr->flags & FILE_SWAP_RANGE_TO_EOF) {
+		int64_t to_eof = i_size_read(file_inode(file)) - pos;
+
+		if (to_eof < count)
+			count = to_eof;
+	}
+
+	return remap_verify_area(file, pos, count, true);
+}
+
+/*
+ * Validate a range swap request and pass it to the filesystem.
+ *
+ * Rejects unknown flags, nonzero padding, and the combination of
+ * FILE_SWAP_RANGE_FULL_FILES with FILE_SWAP_RANGE_TO_EOF (-EINVAL); files on
+ * different superblocks (-EXDEV); and files whose file_operations do not
+ * provide ->swap_file_range (-EOPNOTSUPP).  After a successful swap,
+ * file_modified() and fsnotify_modify() are called on both files.
+ *
+ * Returns 0 or a negative errno.
+ */
+int do_swap_file_range(struct file *file1, struct file *file2,
+		       struct file_swap_range *fsr)
+{
+	int ret;
+
+	if ((fsr->flags & ~FILE_SWAP_RANGE_ALL_FLAGS) ||
+	    memchr_inv(&fsr->pad, 0, sizeof(fsr->pad)))
+		return -EINVAL;
+
+	if ((fsr->flags & FILE_SWAP_RANGE_FULL_FILES) &&
+	    (fsr->flags & FILE_SWAP_RANGE_TO_EOF))
+		return -EINVAL;
+
+	/*
+	 * FISWAPRANGE ioctl enforces that src and dest files are on the same
+	 * mount.  Practically, they only need to be on the same file system.
+	 */
+	if (file_inode(file1)->i_sb != file_inode(file2)->i_sb)
+		return -EXDEV;
+
+	/*
+	 * generic_file_rw_checks() takes a (file_in, file_out) pair, so one
+	 * call only vets file1 as a source and file2 as a destination.  A
+	 * swap writes to both files, so check the pair in both directions.
+	 */
+	ret = generic_file_rw_checks(file1, file2);
+	if (ret < 0)
+		return ret;
+
+	ret = generic_file_rw_checks(file2, file1);
+	if (ret < 0)
+		return ret;
+
+	/* Test the same method pointer that is invoked below. */
+	if (!file2->f_op->swap_file_range)
+		return -EOPNOTSUPP;
+
+	ret = swap_range_verify_area(file1, fsr->file1_offset, fsr);
+	if (ret)
+		return ret;
+
+	ret = swap_range_verify_area(file2, fsr->file2_offset, fsr);
+	if (ret)
+		return ret;
+
+	ret = file2->f_op->swap_file_range(file1, file2, fsr);
+	if (ret)
+		return ret;
+
+	file_modified(file1);
+	file_modified(file2);
+	fsnotify_modify(file1);
+	fsnotify_modify(file2);
+	return ret;
+}
+EXPORT_SYMBOL(do_swap_file_range);
+
+/*
+ * Entry point for swapping ranges between two files: runs
+ * do_swap_file_range() bracketed by file_start_write()/file_end_write() on
+ * file2 and returns its result.
+ */
+int vfs_swap_file_range(struct file *file1, struct file *file2,
+			struct file_swap_range *fsr)
+{
+	int error;
+
+	file_start_write(file2);
+	error = do_swap_file_range(file1, file2, fsr);
+	file_end_write(file2);
+
+	return error;
+}
+EXPORT_SYMBOL(vfs_swap_file_range);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18054120074e..c5b75082b9db 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -844,6 +844,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+/* FISWAPRANGE ---------------- hoisted 129 */
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f6f59b4f22a..63acc11d0804 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1862,6 +1862,8 @@ struct file_operations {
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
+ int (*swap_file_range)(struct file *file_in, struct file *file_out,
+ struct file_swap_range *fsr);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
@@ -1931,6 +1933,8 @@ extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count,
unsigned int remap_flags);
+extern int generic_swap_file_range_prep(struct file *file1, struct file *file2,
+ struct file_swap_range *fsr);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
@@ -1942,7 +1946,13 @@ extern int vfs_dedupe_file_range(struct file *file,
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags);
-
+extern int do_swap_file_range(struct file *file1, struct file *file2,
+ struct file_swap_range *fsr);
+extern int vfs_swap_file_range(struct file *file1, struct file *file2,
+ struct file_swap_range *fsr);
+extern int generic_swap_file_range_check_fresh(struct inode *inode1,
+ struct inode *inode2,
+ const struct file_swap_range *fsr);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
@@ -3120,6 +3130,9 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count, unsigned int remap_flags);
+extern int generic_swap_file_range_checks(struct file *file1,
+ struct file *file2,
+ const struct file_swap_range *fsr);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 379a612f8f1d..a74b49b02e75 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -93,6 +93,60 @@ struct file_dedupe_range {
struct file_dedupe_range_info info[0];
};
+/*
+ * Swap part of file1 with part of the file that this ioctl is being called
+ * against (which we'll call file2). Filesystems must be able to complete
+ * the operation even if the system goes down.
+ */
+struct file_swap_range {
+ __s64 file1_fd; /* open fd referring to file1 */
+ __s64 file1_offset; /* file1 offset, bytes */
+ __s64 file2_offset; /* file2 offset, bytes */
+ __s64 length; /* bytes to swap */
+
+ __u64 flags; /* see FILE_SWAP_RANGE_* below */
+
+ /* file2 metadata for optional freshness checks */
+ __s64 file2_ino; /* inode number */
+ __s64 file2_mtime; /* modification time */
+ __s64 file2_ctime; /* change time */
+ __s32 file2_mtime_nsec; /* mod time, nsec */
+ __s32 file2_ctime_nsec; /* change time, nsec */
+
+ __u64 pad[6]; /* must be zeroes */
+};
+
+/*
+ * Atomic swap operations are not required. This relaxes the requirement that
+ * the filesystem must be able to complete the operation after a crash.
+ */
+#define FILE_SWAP_RANGE_NONATOMIC (1 << 0)
+
+/*
+ * Check file2's inode number, mtime, and ctime against the values provided,
+ * and return -EBUSY if there isn't an exact match.
+ */
+#define FILE_SWAP_RANGE_FILE2_FRESH (1 << 1)
+
+/*
+ * Check that file1's length is equal to file1_offset + length, and that
+ * file2's length is equal to file2_offset + length. Returns -EDOM if there
+ * isn't an exact match. generic_swap_file_range_check_fresh() additionally
+ * requires both offsets to be zero. Cannot be combined with
+ * FILE_SWAP_RANGE_TO_EOF.
+ */
+#define FILE_SWAP_RANGE_FULL_FILES (1 << 2)
+
+/*
+ * Swap file data all the way to the ends of both files, and then swap the file
+ * sizes. This flag can be used to replace a file's contents with a different
+ * amount of data. length will be ignored. Cannot be combined with
+ * FILE_SWAP_RANGE_FULL_FILES.
+ */
+#define FILE_SWAP_RANGE_TO_EOF (1 << 3)
+
+#define FILE_SWAP_RANGE_ALL_FLAGS (FILE_SWAP_RANGE_NONATOMIC | \
+ FILE_SWAP_RANGE_FILE2_FRESH | \
+ FILE_SWAP_RANGE_FULL_FILES | \
+ FILE_SWAP_RANGE_TO_EOF)
+
/* And dynamically-tunable limits and defaults: */
struct files_stat_struct {
unsigned long nr_files; /* read only */
@@ -198,6 +252,7 @@ struct fsxattr {
#define FICLONE _IOW(0x94, 9, int)
#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range)
+#define FISWAPRANGE _IOWR('X', 129, struct file_swap_range)
#define FSLABEL_MAX 256 /* Max chars for the interface; each fs may differ */
diff --git a/mm/filemap.c b/mm/filemap.c
index 23a051a7ef0f..e21b63654767 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3035,6 +3035,83 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in,
return 0;
}
+/*
+ * Performs necessary checks before doing a range swap.
+ *
+ * Returns 0 if the request passes; -EINVAL for a negative length, unaligned
+ * start offsets, offset wraparound, a range that extends past EOF without
+ * FILE_SWAP_RANGE_TO_EOF, a length clipped by file size limits, an unaligned
+ * length that ends at neither file's EOF, or overlapping ranges within one
+ * inode; or the error returned by generic_write_check_limits().
+ */
+int generic_swap_file_range_checks(struct file *file1, struct file *file2,
+ const struct file_swap_range *fsr)
+{
+ struct inode *inode1 = file1->f_mapping->host;
+ struct inode *inode2 = file2->f_mapping->host;
+ int64_t test_len;
+ uint64_t blen;
+ loff_t size1, size2;
+ loff_t bs = inode2->i_sb->s_blocksize;
+ int ret;
+
+ if (fsr->length < 0)
+ return -EINVAL;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(fsr->file1_offset, bs) ||
+ !IS_ALIGNED(fsr->file2_offset, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (fsr->file1_offset + fsr->length < fsr->file1_offset ||
+ fsr->file2_offset + fsr->length < fsr->file2_offset)
+ return -EINVAL;
+
+ size1 = i_size_read(inode1);
+ size2 = i_size_read(inode2);
+
+ /*
+ * Swapext requires both ranges to be within EOF, unless we're swapping
+ * to EOF. generic_swap_file_range_prep already checked that both
+ * fsr->file1_offset and fsr->file2_offset are within EOF.
+ */
+ if (!(fsr->flags & FILE_SWAP_RANGE_TO_EOF) &&
+ (fsr->file1_offset + fsr->length > size1 ||
+ fsr->file2_offset + fsr->length > size2))
+ return -EINVAL;
+
+ /*
+ * Make sure we don't hit any file size limits. If we hit any size
+ * limits such that test_len was adjusted, we abort the whole
+ * operation.
+ */
+ test_len = fsr->length;
+ ret = generic_write_check_limits(file2, fsr->file2_offset, &test_len);
+ if (ret)
+ return ret;
+ ret = generic_write_check_limits(file1, fsr->file1_offset, &test_len);
+ if (ret)
+ return ret;
+ if (test_len != fsr->length)
+ return -EINVAL;
+
+ /*
+ * If the file1 range ends exactly at file1's EOF, round up to the
+ * next block boundary for the overlap check below. Failing that, do
+ * the same if the file2 range ends exactly at file2's EOF.
+ *
+ * Otherwise, reject the range length if it's not block aligned. We
+ * already confirmed the starting offsets' block alignment.
+ */
+ if (fsr->file1_offset + fsr->length == size1)
+ blen = ALIGN(size1, bs) - fsr->file1_offset;
+ else if (fsr->file2_offset + fsr->length == size2)
+ blen = ALIGN(size2, bs) - fsr->file2_offset;
+ else if (!IS_ALIGNED(fsr->length, bs))
+ return -EINVAL;
+ else
+ blen = fsr->length;
+
+ /* Don't allow overlapped swapping within the same file. */
+ if (inode1 == inode2 &&
+ fsr->file2_offset + blen > fsr->file1_offset &&
+ fsr->file1_offset + blen > fsr->file2_offset)
+ return -EINVAL;
+
+ return 0;
+}
/*
* Performs common checks before doing a file copy/clone
next prev parent reply other threads:[~2020-04-29 2:44 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-29 2:44 [PATCH RFC 00/18] xfs: atomic file updates Darrick J. Wong
2020-04-29 2:44 ` [PATCH 01/18] xfs: clean up the error handling in xfs_swap_extent_rmap Darrick J. Wong
2020-04-29 2:44 ` [PATCH 02/18] xfs: fix xfs_reflink_remap_prep calling conventions Darrick J. Wong
2020-05-01 22:54 ` Allison Collins
2020-04-29 2:44 ` Darrick J. Wong [this message]
2020-04-29 2:44 ` [PATCH 04/18] xfs: support deferred bmap updates on the attr fork Darrick J. Wong
2020-04-29 2:44 ` [PATCH 05/18] xfs: xfs_bmap_finish_one should map unwritten extents properly Darrick J. Wong
2020-04-29 2:44 ` [PATCH 06/18] xfs: create a log incompat flag for atomic extent swapping Darrick J. Wong
2020-04-29 2:45 ` [PATCH 07/18] xfs: allow deferred ops items to put themselves at the end of the pending queue Darrick J. Wong
2020-04-29 2:45 ` [PATCH 08/18] xfs: introduce a swap-extent log intent item Darrick J. Wong
2020-04-29 2:45 ` [PATCH 09/18] xfs: create deferred log items for extent swapping Darrick J. Wong
2020-04-29 2:45 ` [PATCH 10/18] xfs: refactor locking and unlocking two inodes against userspace IO Darrick J. Wong
2020-04-29 2:45 ` [PATCH 11/18] xfs: add a ->swap_file_range handler Darrick J. Wong
2020-04-29 2:45 ` [PATCH 12/18] xfs: add error injection to test swapext recovery Darrick J. Wong
2020-04-29 2:45 ` [PATCH 13/18] xfs: allow xfs_swap_range to use older extent swap algorithms Darrick J. Wong
2020-04-29 2:45 ` [PATCH 14/18] xfs: port xfs_swap_extents_rmap to our new code Darrick J. Wong
2020-04-29 2:45 ` [PATCH 15/18] xfs: consolidate all of the xfs_swap_extent_forks code Darrick J. Wong
2020-04-29 2:45 ` [PATCH 16/18] xfs: refactor reflink flag handling in xfs_swap_extent_forks Darrick J. Wong
2020-04-29 2:46 ` [PATCH 17/18] xfs: remove old swap extents implementation Darrick J. Wong
2020-04-29 2:46 ` [PATCH 18/18] xfs: fix quota accounting in the old fork swap code Darrick J. Wong
2020-05-01 19:46 ` [PATCH RFC 00/18] xfs: atomic file updates Jann Horn
2020-05-01 20:11 ` Darrick J. Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=158812827320.168506.17255602633619684843.stgit@magnolia \
--to=darrick.wong@oracle.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox