From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: david@fromorbit.com, darrick.wong@oracle.com
Cc: linux-fsdevel@vger.kernel.org, xfs@oss.sgi.com
Subject: [PATCH 52/58] xfs: emulate the btrfs dedupe extent same ioctl
Date: Tue, 06 Oct 2015 22:00:51 -0700 [thread overview]
Message-ID: <20151007050051.30457.45420.stgit@birch.djwong.org> (raw)
In-Reply-To: <20151007045443.30457.47038.stgit@birch.djwong.org>
Emulate the BTRFS_IOC_EXTENT_SAME ioctl. This operation is similar
to clone_range, but the kernel must confirm that the contents of the
two extents are identical before performing the reflink.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/libxfs/xfs_fs.h | 30 ++++++++++++
fs/xfs/xfs_ioctl.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++--
fs/xfs/xfs_ioctl32.c | 1
fs/xfs/xfs_reflink.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_reflink.h | 6 ++
5 files changed, 275 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2c8cd04..c63afd4 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -570,8 +570,38 @@ struct xfs_clone_args {
__u64 dest_offset;
};
+/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
+#define XFS_EXTENT_DATA_SAME 0
+#define XFS_EXTENT_DATA_DIFFERS 1
+
+/*
+ * Per-destination request/result record of the extent-same ioctl.
+ * Layout taken from struct btrfs_ioctl_file_extent_same_info.
+ */
+struct xfs_extent_data_info {
+ __s64 fd; /* in - destination file */
+ __u64 logical_offset; /* in - start of extent in destination */
+ __u64 bytes_deduped; /* out - total # of bytes we were able
+ * to dedupe from this file */
+ /* status of this dedupe operation:
+ * XFS_EXTENT_DATA_SAME (0) if dedupe succeeds
+ * < 0 (negative errno) for error
+ * == XFS_EXTENT_DATA_DIFFERS if data differs
+ */
+ __s32 status; /* out - see above description */
+ __u32 reserved;
+};
+
+/*
+ * Argument of XFS_IOC_FILE_EXTENT_SAME: describes one source extent and
+ * carries a trailing array of dest_count destination records.
+ * Layout taken from struct btrfs_ioctl_file_extent_same_args.
+ */
+struct xfs_extent_data {
+ __u64 logical_offset; /* in - start of extent in source */
+ __u64 length; /* in - length of extent; the ioctl handler
+ * clamps this to XFS_MAX_DEDUPE_LEN */
+ __u16 dest_count; /* in - total elements in info array */
+ __u16 reserved1;
+ __u32 reserved2;
+ struct xfs_extent_data_info info[0]; /* in/out - one per destination */
+};
+
#define XFS_IOC_CLONE _IOW (0x94, 9, int)
#define XFS_IOC_CLONE_RANGE _IOW (0x94, 13, struct xfs_clone_args)
+#define XFS_IOC_FILE_EXTENT_SAME _IOWR(0x94, 54, struct xfs_extent_data)
#ifndef HAVE_BBMACROS
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ce4812e..50ea19e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1541,7 +1541,8 @@ xfs_ioctl_reflink(
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- size_t len)
+ size_t len,
+ bool is_dedupe)
{
struct inode *inode_in;
struct inode *inode_out;
@@ -1550,6 +1551,7 @@ xfs_ioctl_reflink(
loff_t isize;
int same_inode;
loff_t blen;
+ unsigned int flags;
if (len == 0)
return 0;
@@ -1629,8 +1631,12 @@ xfs_ioctl_reflink(
if (ret)
goto out_unlock;
+ flags = 0;
+ if (is_dedupe)
+ flags |= XFS_REFLINK_DEDUPE;
+
ret = xfs_reflink(XFS_I(inode_in), pos_in, XFS_I(inode_out),
- pos_out, len);
+ pos_out, len, flags);
if (ret < 0)
goto out_unlock;
@@ -1652,6 +1658,112 @@ out_unlock:
return ret;
}
+/* Upper bound on the number of bytes deduped per ioctl call. */
+#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+
+/*
+ * Handle XFS_IOC_FILE_EXTENT_SAME.
+ *
+ * Copies the request header plus its dest_count destination records in
+ * from userspace, then, for each destination, asks xfs_ioctl_reflink() to
+ * share the source range [logical_offset, logical_offset + length) with
+ * the destination in dedupe mode (i.e. only if the contents match).
+ *
+ * @file: source file; must be open for reading and be a regular file
+ * @argp: userspace request; the per-destination status and bytes_deduped
+ *        fields are filled in and the whole request is copied back
+ *
+ * Per-destination failures are reported through info[i].status and do not
+ * fail the ioctl.  Returns 0 once the results have been copied back, or a
+ * negative errno: -EINVAL (source not readable), -EFAULT (bad user
+ * pointer), -EISDIR / -EACCES (source is a directory / not a regular
+ * file), or whatever memdup_user() reports.
+ */
+static long
+xfs_ioctl_file_extent_same(
+	struct file			*file,
+	struct xfs_extent_data __user	*argp)
+{
+	struct xfs_extent_data		*same;
+	struct xfs_extent_data_info	*info;
+	struct inode			*src;
+	u64				off;
+	u64				len;
+	int				i;
+	int				ret;
+	unsigned long			size;
+	bool				is_admin;
+	u16				count;
+
+	is_admin = capable(CAP_SYS_ADMIN);
+	src = file_inode(file);
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
+
+	/* Nothing is allocated yet, so early failures can return directly. */
+	if (get_user(count, &argp->dest_count))
+		return -EFAULT;
+
+	size = offsetof(struct xfs_extent_data __user,
+			info[count]);
+
+	same = memdup_user(argp, size);
+	if (IS_ERR(same))
+		return PTR_ERR(same);
+
+	/*
+	 * dest_count was fetched twice from userspace (get_user above and
+	 * memdup_user here) and may have been changed in between.  Only
+	 * 'count' was used to size the buffer, so make the copy agree.
+	 */
+	same->dest_count = count;
+
+	off = same->logical_offset;
+	len = same->length;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > XFS_MAX_DEDUPE_LEN)
+		len = XFS_MAX_DEDUPE_LEN;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out_free;
+
+	ret = -EACCES;
+	if (!S_ISREG(src->i_mode))
+		goto out_free;
+
+	/* pre-format output fields to sane values */
+	for (i = 0; i < count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = 0;
+	}
+
+	for (i = 0, info = same->info; i < count; i++, info++) {
+		struct inode	*dst;
+		struct fd	dst_file = fdget(info->fd);
+
+		if (!dst_file.file) {
+			info->status = -EBADF;
+			continue;
+		}
+		dst = file_inode(dst_file.file);
+
+		trace_xfs_ioctl_file_extent_same(file_inode(file), off, len,
+				dst, info->logical_offset);
+
+		info->bytes_deduped = 0;
+		if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+			/* need write access to the destination unless admin */
+			info->status = -EINVAL;
+		} else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+			info->status = -EXDEV;
+		} else if (S_ISDIR(dst->i_mode)) {
+			info->status = -EISDIR;
+		} else if (!S_ISREG(dst->i_mode)) {
+			info->status = -EACCES;
+		} else {
+			/*
+			 * NOTE(review): the clone callers of
+			 * xfs_ioctl_reflink() squash positive returns to 0;
+			 * confirm it cannot return > 0 in dedupe mode, or
+			 * that value would land in ->status unchanged.
+			 */
+			info->status = xfs_ioctl_reflink(file, off,
+							 dst_file.file,
+							 info->logical_offset,
+							 len, true);
+			if (info->status == -EBADE)
+				info->status = XFS_EXTENT_DATA_DIFFERS;
+			else if (info->status == 0)
+				info->bytes_deduped = len;
+		}
+		fdput(dst_file);
+	}
+
+	ret = copy_to_user(argp, same, size) ? -EFAULT : 0;
+
+out_free:
+	/* memdup_user() memory belongs to us; release it on every path. */
+	kfree(same);
+	return ret;
+}
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -1959,7 +2071,7 @@ xfs_file_ioctl(
trace_xfs_ioctl_clone(file_inode(src.file), file_inode(filp));
- error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL);
+ error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL, false);
fdput(src);
if (error > 0)
error = 0;
@@ -1984,7 +2096,8 @@ xfs_file_ioctl(
file_inode(filp), args.dest_offset);
error = xfs_ioctl_reflink(src.file, args.src_offset, filp,
- args.dest_offset, args.src_length);
+ args.dest_offset, args.src_length,
+ false);
fdput(src);
if (error > 0)
error = 0;
@@ -1992,6 +2105,9 @@ xfs_file_ioctl(
return error;
}
+ case XFS_IOC_FILE_EXTENT_SAME:
+ return xfs_ioctl_file_extent_same(filp, arg);
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 76d8729..575c292 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -560,6 +560,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_ERROR_CLEARALL:
case XFS_IOC_CLONE:
case XFS_IOC_CLONE_RANGE:
+ case XFS_IOC_FILE_EXTENT_SAME:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ac81b02..dee3556 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1386,6 +1386,103 @@ advloop:
}
#undef IMAPNEXT
+/*
+ * Bring the page covering byte @offset of @inode into the page cache and
+ * return it with a reference held.
+ *
+ * Returns the page on success, an ERR_PTR() if read_mapping_page() fails,
+ * or NULL if the page came back but is not marked uptodate (in which case
+ * the reference has already been dropped).  Callers must check for both
+ * failure forms and release a successful page with page_cache_release().
+ */
+STATIC struct page *
+xfs_get_page(
+	struct inode	*inode,		/* inode */
+	xfs_off_t	offset)		/* where in the inode to read */
+{
+	pgoff_t		index = offset >> PAGE_CACHE_SHIFT;
+	struct page	*pg;
+
+	pg = read_mapping_page(inode->i_mapping, index, NULL);
+
+	/* Hand back either the error pointer or a fully valid page. */
+	if (IS_ERR(pg) || PageUptodate(pg))
+		return pg;
+
+	/* Contents are not valid: drop our reference and report failure. */
+	page_cache_release(pg);
+	return NULL;
+}
+
+/*
+ * Compare byte ranges of two files to see if they are the same.
+ *
+ * Walks both ranges through the page cache, comparing at most one page
+ * (or the remainder of whichever page ends first) per iteration, and
+ * stops at the first mismatch.
+ *
+ * @src, @srcoff:   first inode and starting byte offset within it
+ * @dest, @destoff: second inode and starting byte offset within it
+ * @len:            number of bytes to compare
+ * @is_same:        out: true if every compared byte matched
+ *
+ * Returns 0 with *is_same set, or a negative errno if a page could not be
+ * read (*is_same is left untouched in that case).  The errno is the one
+ * reported by the page read, or -EINVAL if the page was read but is not
+ * uptodate.
+ */
+STATIC int
+xfs_compare_extents(
+	struct inode	*src,		/* first inode */
+	xfs_off_t	srcoff,		/* offset of first inode */
+	struct inode	*dest,		/* second inode */
+	xfs_off_t	destoff,	/* offset of second inode */
+	xfs_off_t	len,		/* length of data to compare */
+	bool		*is_same)	/* out: true if the contents match */
+{
+	xfs_off_t	src_poff;
+	xfs_off_t	dest_poff;
+	void		*src_addr;
+	void		*dest_addr;
+	struct page	*src_page;
+	struct page	*dest_page;
+	xfs_off_t	cmp_len;
+	bool		same;
+	int		error;
+
+	same = true;
+	while (len) {
+		/* Compare up to the nearer of the two page boundaries. */
+		src_poff = srcoff & (PAGE_CACHE_SIZE - 1);
+		dest_poff = destoff & (PAGE_CACHE_SIZE - 1);
+		cmp_len = min(PAGE_CACHE_SIZE - src_poff,
+			      PAGE_CACHE_SIZE - dest_poff);
+		cmp_len = min(cmp_len, len);
+		ASSERT(cmp_len > 0);
+
+		trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
+				XFS_I(dest), destoff);
+
+		/*
+		 * xfs_get_page() reports failure either as an ERR_PTR (read
+		 * error) or as NULL (page not uptodate); catch both so that
+		 * an error pointer is never mapped and dereferenced.
+		 */
+		src_page = xfs_get_page(src, srcoff);
+		if (IS_ERR_OR_NULL(src_page)) {
+			error = src_page ? PTR_ERR(src_page) : -EINVAL;
+			goto out_error;
+		}
+		dest_page = xfs_get_page(dest, destoff);
+		if (IS_ERR_OR_NULL(dest_page)) {
+			error = dest_page ? PTR_ERR(dest_page) : -EINVAL;
+			page_cache_release(src_page);
+			goto out_error;
+		}
+
+		src_addr = kmap_atomic(src_page);
+		dest_addr = kmap_atomic(dest_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dest_page);
+
+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+			same = false;
+
+		/* Atomic kmaps nest like a stack: unmap in reverse order. */
+		kunmap_atomic(dest_addr);
+		kunmap_atomic(src_addr);
+		page_cache_release(dest_page);
+		page_cache_release(src_page);
+
+		if (!same)
+			break;
+
+		srcoff += cmp_len;
+		destoff += cmp_len;
+		len -= cmp_len;
+	}
+
+	*is_same = same;
+	return 0;
+
+out_error:
+	trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
+	return error;
+}
+
/**
* xfs_reflink() - link a range of blocks from one inode to another
*
@@ -1394,6 +1491,7 @@ advloop:
* @dest: Inode to clone to
* @destoff: Offset within @inode to start clone
* @len: Original length, passed by user, of range to clone
+ * @flags: Flags to modify reflink's behavior
*/
int
xfs_reflink(
@@ -1401,12 +1499,14 @@ xfs_reflink(
xfs_off_t srcoff,
struct xfs_inode *dest,
xfs_off_t destoff,
- xfs_off_t len)
+ xfs_off_t len,
+ unsigned int flags)
{
struct xfs_mount *mp = src->i_mount;
xfs_fileoff_t sfsbno, dfsbno;
xfs_filblks_t fsblen;
int error;
+ bool is_same;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1418,6 +1518,9 @@ xfs_reflink(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
return -EINVAL;
+ if (flags & ~XFS_REFLINK_ALL)
+ return -EINVAL;
+
trace_xfs_reflink_range(src, srcoff, len, dest, destoff);
/* Lock both files against IO */
@@ -1429,6 +1532,21 @@ xfs_reflink(
xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
}
+ /*
+ * Check that the extents are the same.
+ */
+ if (flags & XFS_REFLINK_DEDUPE) {
+ is_same = false;
+ error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+ destoff, len, &is_same);
+ if (error)
+ goto out_error;
+ if (!is_same) {
+ error = -EBADE;
+ goto out_error;
+ }
+ }
+
error = set_inode_reflink_flag(src, dest);
if (error)
goto out_error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index b633824..c60a9bd 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -44,7 +44,11 @@ extern int xfs_reflink_finish_fork_buf(struct xfs_inode *ip, struct xfs_buf *bp,
xfs_fileoff_t fileoff, struct xfs_trans *tp, int write_error,
xfs_fsblock_t old_fsbno);
+#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */
+#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE)
+
extern int xfs_reflink(struct xfs_inode *src, xfs_off_t srcoff,
- struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
+ struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+ unsigned int flags);
#endif /* __XFS_REFLINK_H */
next prev parent reply other threads:[~2015-10-07 5:00 UTC|newest]
Thread overview: 67+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-10-07 4:54 [RFCv3 00/58] xfs: add reverse-mapping, reflink, and dedupe support Darrick J. Wong
2015-10-07 4:54 ` [PATCH 01/58] libxfs: make xfs_alloc_fix_freelist non-static Darrick J. Wong
2015-10-07 4:54 ` [PATCH 02/58] xfs: fix log ticket type printing Darrick J. Wong
2015-10-07 4:55 ` [PATCH 03/58] xfs: introduce rmap btree definitions Darrick J. Wong
2015-10-07 4:55 ` [PATCH 04/58] xfs: add rmap btree stats infrastructure Darrick J. Wong
2015-10-07 4:55 ` [PATCH 05/58] xfs: rmap btree add more reserved blocks Darrick J. Wong
2015-10-07 4:55 ` [PATCH 06/58] xfs: add owner field to extent allocation and freeing Darrick J. Wong
2015-10-07 4:55 ` [PATCH 07/58] xfs: add extended " Darrick J. Wong
2015-10-07 4:55 ` [PATCH 08/58] xfs: introduce rmap extent operation stubs Darrick J. Wong
2015-10-07 4:55 ` [PATCH 09/58] xfs: extend rmap extent operation stubs to take full owner info Darrick J. Wong
2015-10-07 4:55 ` [PATCH 10/58] xfs: define the on-disk rmap btree format Darrick J. Wong
2015-10-07 4:55 ` [PATCH 11/58] xfs: enhance " Darrick J. Wong
2015-10-07 4:56 ` [PATCH 12/58] xfs: add rmap btree growfs support Darrick J. Wong
2015-10-07 4:56 ` [PATCH 13/58] xfs: enhance " Darrick J. Wong
2015-10-07 4:56 ` [PATCH 14/58] xfs: rmap btree transaction reservations Darrick J. Wong
2015-10-07 4:56 ` [PATCH 15/58] xfs: rmap btree requires more reserved free space Darrick J. Wong
2015-10-07 4:56 ` [PATCH 16/58] libxfs: fix min freelist length calculation Darrick J. Wong
2015-10-07 4:56 ` [PATCH 17/58] xfs: add rmap btree operations Darrick J. Wong
2015-10-07 4:57 ` [PATCH 18/58] xfs: enhance " Darrick J. Wong
2015-10-07 4:57 ` [PATCH 19/58] xfs: add an extent to the rmap btree Darrick J. Wong
2015-10-07 4:57 ` [PATCH 20/58] xfs: add tracepoints for the rmap-mirrors-bmbt functions Darrick J. Wong
2015-10-07 4:57 ` [PATCH 21/58] xfs: teach rmap_alloc how to deal with our larger rmap btree Darrick J. Wong
2015-10-07 4:57 ` [PATCH 22/58] xfs: remove an extent from the " Darrick J. Wong
2015-10-07 4:57 ` [PATCH 23/58] xfs: enhanced " Darrick J. Wong
2015-10-07 4:57 ` [PATCH 24/58] xfs: add rmap btree insert and delete helpers Darrick J. Wong
2015-10-07 4:57 ` [PATCH 25/58] xfs: bmap btree changes should update rmap btree Darrick J. Wong
2015-10-21 21:39 ` Darrick J. Wong
2015-10-07 4:57 ` [PATCH 26/58] xfs: add rmap btree geometry feature flag Darrick J. Wong
2015-10-07 4:58 ` [PATCH 27/58] xfs: add rmap btree block detection to log recovery Darrick J. Wong
2015-10-07 4:58 ` [PATCH 28/58] xfs: enable the rmap btree functionality Darrick J. Wong
2015-10-07 4:58 ` [PATCH 29/58] xfs: disable XFS_IOC_SWAPEXT when rmap btree is enabled Darrick J. Wong
2015-10-07 4:58 ` [PATCH 30/58] xfs: implement " Darrick J. Wong
2015-10-07 4:58 ` [PATCH 31/58] libxfs: refactor short btree block verification Darrick J. Wong
2015-10-07 4:58 ` [PATCH 32/58] xfs: don't update rmapbt when fixing agfl Darrick J. Wong
2015-10-07 4:58 ` [PATCH 33/58] xfs: introduce refcount btree definitions Darrick J. Wong
2015-10-07 4:58 ` [PATCH 34/58] xfs: add refcount btree stats infrastructure Darrick J. Wong
2015-10-07 4:58 ` [PATCH 35/58] xfs: refcount btree add more reserved blocks Darrick J. Wong
2015-10-07 4:59 ` [PATCH 36/58] xfs: define the on-disk refcount btree format Darrick J. Wong
2015-10-07 4:59 ` [PATCH 37/58] xfs: define tracepoints for refcount/reflink activities Darrick J. Wong
2015-10-07 4:59 ` [PATCH 38/58] xfs: add refcount btree support to growfs Darrick J. Wong
2015-10-07 4:59 ` [PATCH 39/58] xfs: add refcount btree operations Darrick J. Wong
2015-10-07 4:59 ` [PATCH 40/58] libxfs: adjust refcount of an extent of blocks in refcount btree Darrick J. Wong
2015-10-27 19:05 ` Darrick J. Wong
2015-10-30 20:56 ` Darrick J. Wong
2015-10-07 4:59 ` [PATCH 41/58] libxfs: adjust refcount when unmapping file blocks Darrick J. Wong
2015-10-07 4:59 ` [PATCH 42/58] xfs: add refcount btree block detection to log recovery Darrick J. Wong
2015-10-07 4:59 ` [PATCH 43/58] xfs: map an inode's offset to an exact physical block Darrick J. Wong
2015-10-07 4:59 ` [PATCH 44/58] xfs: add reflink feature flag to geometry Darrick J. Wong
2015-10-07 5:00 ` [PATCH 45/58] xfs: create a separate workqueue for copy-on-write activities Darrick J. Wong
2015-10-07 5:00 ` [PATCH 46/58] xfs: implement copy-on-write for reflinked blocks Darrick J. Wong
2015-10-07 5:00 ` [PATCH 47/58] xfs: handle directio " Darrick J. Wong
2015-10-07 5:00 ` [PATCH 48/58] xfs: copy-on-write reflinked blocks when zeroing ranges of blocks Darrick J. Wong
2015-10-21 21:17 ` Darrick J. Wong
2015-10-07 5:00 ` [PATCH 49/58] xfs: clear inode reflink flag when freeing blocks Darrick J. Wong
2015-10-07 5:00 ` [PATCH 50/58] xfs: reflink extents from one file to another Darrick J. Wong
2015-10-07 5:12 ` kbuild test robot
2015-10-07 5:00 ` [PATCH 51/58] xfs: add clone file and clone range ioctls Darrick J. Wong
2015-10-07 5:13 ` kbuild test robot
2015-10-07 6:46 ` kbuild test robot
2015-10-07 7:35 ` kbuild test robot
2015-10-07 5:00 ` Darrick J. Wong [this message]
2015-10-07 5:00 ` [PATCH 53/58] xfs: teach fiemap about reflink'd extents Darrick J. Wong
2015-10-07 5:01 ` [PATCH 54/58] xfs: swap inode reflink flags when swapping inode extents Darrick J. Wong
2015-10-07 5:01 ` [PATCH 55/58] vfs: add a FALLOC_FL_UNSHARE mode to fallocate to unshare a range of blocks Darrick J. Wong
2015-10-07 5:01 ` [PATCH 56/58] xfs: unshare a range of blocks via fallocate Darrick J. Wong
2015-10-07 5:01 ` [PATCH 57/58] xfs: support XFS_XFLAG_REFLINK (and FS_NOCOW_FL) on reflink filesystems Darrick J. Wong
2015-10-07 5:01 ` [PATCH 58/58] xfs: recognize the reflink feature bit Darrick J. Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20151007050051.30457.45420.stgit@birch.djwong.org \
--to=darrick.wong@oracle.com \
--cc=david@fromorbit.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).