From: Mark Fasheh <mfasheh@suse.de>
To: linux-btrfs@vger.kernel.org
Cc: Chris Mason <chris.mason@fusionio.com>,
Josef Bacik <josef@redhat.com>,
Gabriel de Perthuis <g2p.code@gmail.com>,
David Sterba <dsterba@suse.cz>, Mark Fasheh <mfasheh@suse.de>
Subject: [PATCH 4/4] btrfs: offline dedupe
Date: Tue, 21 May 2013 11:28:28 -0700 [thread overview]
Message-ID: <1369160908-26195-5-git-send-email-mfasheh@suse.de> (raw)
In-Reply-To: <1369160908-26195-1-git-send-email-mfasheh@suse.de>
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME which will try to
de-duplicate a list of extents across a range of files.
Internally, the ioctl re-uses code from the clone ioctl. This avoids
rewriting a large chunk of extent handling code.
Userspace passes in an array of file, offset pairs along with a length
argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison
of the user data before deduping the extent. Status and number of bytes
deduped are returned for each operation.
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
fs/btrfs/ioctl.c | 290 +++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/btrfs.h | 27 +++++
2 files changed, 317 insertions(+)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e90c519..54fcb90 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -57,6 +57,9 @@
#include "send.h"
#include "dev-replace.h"
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
{
@@ -2456,6 +2459,61 @@ out:
return ret;
}
+/*
+ * Copy @len bytes of file data, starting at byte offset @off of @inode,
+ * into a freshly allocated buffer.
+ *
+ * Pages are taken from the page cache; a page that is not up to date is
+ * read with extent_read_full_page_nolock() first.  Only the bytes inside
+ * [@off, @off + @len) are copied, so neither @off nor @len has to be a
+ * multiple of the page size and the copy can never run past the @len
+ * bytes that were allocated.
+ *
+ * Returns 0 and stores the buffer in *@cur_buffer on success; the caller
+ * owns it and must kfree() it.  Returns -ENOMEM if the buffer or a page
+ * cache page could not be obtained and -EIO if a page could not be brought
+ * up to date; *@cur_buffer is left untouched in those cases.
+ */
+static noinline int fill_data(struct inode *inode, u64 off, u64 len,
+			      char **cur_buffer)
+{
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct page *page;
+	char *buffer;
+	char *addr;
+	u64 pos = off;
+	u64 remaining = len;
+	size_t bytes_copied = 0;
+	int ret = 0;
+
+	buffer = kmalloc(len, GFP_NOFS);
+	if (!buffer)
+		return -ENOMEM;
+
+	/* Driven by bytes left rather than page indexes: safe for len == 0. */
+	while (remaining) {
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		size_t pg_off = pos & (PAGE_CACHE_SIZE - 1);
+		size_t chunk = PAGE_CACHE_SIZE - pg_off;
+
+		/* The last page may only be partially inside the range. */
+		if (chunk > remaining)
+			chunk = remaining;
+
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (!PageUptodate(page)) {
+			/*
+			 * NOTE(review): the page is re-locked after the read
+			 * is issued, which presumably waits for the read to
+			 * finish - confirm against
+			 * extent_read_full_page_nolock().  Its return value
+			 * is not used; failure is detected by the second
+			 * PageUptodate() test.
+			 */
+			extent_read_full_page_nolock(tree, page,
+						     btrfs_get_extent, 0);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		addr = kmap(page);
+		memcpy(buffer + bytes_copied, addr + pg_off, chunk);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		bytes_copied += chunk;
+		pos += chunk;
+		remaining -= chunk;
+	}
+
+	*cur_buffer = buffer;
+
+out:
+	if (ret)
+		kfree(buffer);
+	return ret;
+}
+
+
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
{
/* do any pending delalloc/csum calc on src, one way or
@@ -2476,6 +2534,236 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
}
}
+/*
+ * Counterpart of btrfs_double_lock(): release the extent range
+ * [loff, loff + len - 1] on each inode's io_tree, then drop both
+ * i_mutexes.
+ */
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+				struct inode *inode2, u64 loff2, u64 len)
+{
+	struct extent_io_tree *tree1 = &BTRFS_I(inode1)->io_tree;
+	struct extent_io_tree *tree2 = &BTRFS_I(inode2)->io_tree;
+	u64 end1 = loff1 + len - 1;
+	u64 end2 = loff2 + len - 1;
+
+	unlock_extent(tree1, loff1, end1);
+	unlock_extent(tree2, loff2, end2);
+
+	mutex_unlock(&inode1->i_mutex);
+	mutex_unlock(&inode2->i_mutex);
+}
+
+/*
+ * Take i_mutex on both inodes and then lock_extent_range() on both
+ * ranges.
+ *
+ * The inode with the lower address always goes first (I_MUTEX_PARENT,
+ * then I_MUTEX_CHILD for the other), so the order does not depend on
+ * which inode the caller passed as inode1.  btrfs_extent_same() rejects
+ * src == dst before calling this.
+ */
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+			      struct inode *inode2, u64 loff2, u64 len)
+{
+	struct inode *first = inode1;
+	struct inode *second = inode2;
+	u64 first_off = loff1;
+	u64 second_off = loff2;
+
+	if (!(inode1 < inode2)) {
+		first = inode2;
+		second = inode1;
+		first_off = loff2;
+		second_off = loff1;
+	}
+
+	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
+	lock_extent_range(first, first_off, len);
+	lock_extent_range(second, second_off, len);
+}
+
+/*
+ * Dedupe a single range: compare @len bytes at @loff in @src with @len
+ * bytes at @dst_loff in @dst and, if they are identical, call
+ * btrfs_clone() from @src to @dst for that range.
+ *
+ * Both inodes and both ranges are locked with btrfs_double_lock() before
+ * the data is read and stay locked until after the clone.
+ *
+ * Returns 0 on success, BTRFS_SAME_DATA_DIFFERS if the two ranges do not
+ * hold the same bytes, -EINVAL if @src and @dst are the same inode, or
+ * the error returned by fill_data() or btrfs_clone().
+ */
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+			     struct inode *dst, u64 dst_loff)
+{
+	char *orig_buffer = NULL;
+	char *dst_inode_buffer = NULL;
+	int ret;
+
+	/*
+	 * btrfs_clone() can't handle extents in the same file
+	 * yet. Once that works, we can drop this check and replace it
+	 * with a check for the same inode, but overlapping extents.
+	 */
+	if (src == dst)
+		return -EINVAL;
+
+	btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+	ret = fill_data(src, loff, len, &orig_buffer);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to populate source data "
+		       "buffer.\n");
+		goto out;
+	}
+
+	ret = fill_data(dst, dst_loff, len, &dst_inode_buffer);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to populate destination data "
+		       "buffer.\n");
+		goto out;
+	}
+
+	/* memcmp() only tells us "differs"; map that to the ioctl status. */
+	ret = memcmp(orig_buffer, dst_inode_buffer, len);
+	if (ret) {
+		ret = BTRFS_SAME_DATA_DIFFERS;
+		printk(KERN_ERR "btrfs: data for inode %lu does not "
+		       "match\n", dst->i_ino);
+		goto out;
+	}
+
+	ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out:
+	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+	/* Either buffer may still be NULL here; kfree(NULL) is a no-op. */
+	kfree(dst_inode_buffer);
+	kfree(orig_buffer);
+	return ret;
+}
+
+
+#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+#define BTRFS_ONE_DEDUPE_LEN (1 * 1024 * 1024)
+
+/*
+ * BTRFS_IOC_FILE_EXTENT_SAME handler.
+ *
+ * @file is the source file.  @argp points to a struct
+ * btrfs_ioctl_same_args followed by dest_count struct
+ * btrfs_ioctl_same_extent_info entries; the whole argument must fit in
+ * PAGE_CACHE_SIZE bytes.
+ *
+ * The source range (logical_offset, length clamped to
+ * BTRFS_MAX_DEDUPE_LEN) must be block aligned and lie inside the source
+ * file.  Each destination is then processed independently, in pieces of
+ * at most BTRFS_ONE_DEDUPE_LEN bytes, and its result is reported through
+ * that entry's status and bytes_deduped fields.
+ *
+ * Returns 0 once the destinations have been processed (per-destination
+ * failures are only visible in info[i].status), or a negative errno if
+ * the argument could not be copied or the source range is invalid.
+ *
+ * NOTE(review): nothing here requires the destination descriptors to be
+ * open for writing - confirm that this is the intended policy.
+ */
+static long btrfs_ioctl_file_extent_same(struct file *file,
+					 void __user *argp)
+{
+	struct btrfs_ioctl_same_args *args;
+	struct btrfs_ioctl_same_args tmp;
+	struct btrfs_ioctl_same_extent_info *info;
+	struct inode *src = file->f_dentry->d_inode;
+	struct file *dst_file = NULL;
+	struct inode *dst;
+	u64 off;
+	u64 len;
+	int args_size;
+	int i;
+	int ret;
+	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+
+	/* First copy just the fixed header to learn dest_count. */
+	if (copy_from_user(&tmp,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   sizeof(tmp)))
+		return -EFAULT;
+
+	args_size = sizeof(tmp) + (tmp.dest_count *
+			sizeof(struct btrfs_ioctl_same_extent_info));
+
+	/* Keep size of ioctl argument sane */
+	if (args_size > PAGE_CACHE_SIZE)
+		return -E2BIG;
+
+	args = kmalloc(args_size, GFP_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(args,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   args_size))
+		goto out;
+	/* Make sure args didn't change magically between copies. */
+	if (memcmp(&tmp, args, sizeof(tmp)))
+		goto out;
+
+	if ((sizeof(tmp) + (sizeof(*info) * args->dest_count)) > args_size)
+		goto out;
+
+	/* pre-format 'out' fields to sane default values */
+	for (i = 0; i < args->dest_count; i++) {
+		info = &args->info[i];
+		info->bytes_deduped = 0;
+		info->status = 0;
+	}
+
+	off = args->logical_offset;
+	len = args->length;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the entire ioctl to something sane.
+	 */
+	if (len > BTRFS_MAX_DEDUPE_LEN)
+		len = BTRFS_MAX_DEDUPE_LEN;
+
+	ret = -EINVAL;
+	if (off + len > src->i_size || off + len < off)
+		goto out;
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+		goto out;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
+
+	ret = 0;
+	for (i = 0; i < args->dest_count; i++) {
+		u64 dest_off;
+		u64 src_off;
+		u64 op_len;
+
+		info = &args->info[i];
+
+		dst_file = fget(info->fd);
+		if (!dst_file) {
+			printk(KERN_ERR "btrfs: invalid fd %lld\n", info->fd);
+			info->status = -EBADF;
+			continue;
+		}
+
+		dst = dst_file->f_dentry->d_inode;
+
+		/*
+		 * Everything below treats dst as a btrfs inode on the same
+		 * filesystem as src (BTRFS_I(), btrfs_clone()), so refuse
+		 * descriptors that point anywhere else.
+		 */
+		if (dst->i_sb != src->i_sb) {
+			info->status = -EXDEV;
+			goto next;
+		}
+
+		if (S_ISDIR(dst->i_mode)) {
+			printk(KERN_ERR "btrfs: file is dir %lld\n", info->fd);
+			info->status = -EISDIR;
+			goto next;
+		}
+
+		/* Default status for the remaining validation failures. */
+		info->status = -EINVAL;
+		if (dst == src) {
+			printk(KERN_ERR "btrfs: file dup %lld\n", info->fd);
+			goto next;
+		}
+
+		dest_off = info->logical_offset;
+
+		if (dest_off + len > dst->i_size || dest_off + len < dest_off)
+			goto next;
+		if (!IS_ALIGNED(dest_off, bs))
+			goto next;
+
+		/*
+		 * The purpose of this loop is to limit the number of
+		 * bytes we dedupe during a single call to
+		 * btrfs_extent_same().
+		 *
+		 * In order to memcmp the data we have to allocate a
+		 * pair of buffers. We don't want to allocate too
+		 * large a buffer, so limiting the size for each
+		 * dedupe is an easy way to do this.
+		 */
+		src_off = off;
+		op_len = len;
+		while (op_len) {
+			u64 tmp_len;
+
+			tmp_len = op_len;
+			if (op_len > BTRFS_ONE_DEDUPE_LEN)
+				tmp_len = BTRFS_ONE_DEDUPE_LEN;
+
+			info->status = btrfs_extent_same(src, src_off, tmp_len,
+							 dst, dest_off);
+			if (info->status == 0) {
+				info->bytes_deduped += tmp_len;
+			} else
+				break;
+
+			dest_off += tmp_len;
+			src_off += tmp_len;
+			op_len -= tmp_len;
+		}
+
+next:
+		fput(dst_file);
+		dst_file = NULL;
+	}
+
+	/* Hand the per-destination status/bytes_deduped back to userspace. */
+	if (copy_to_user(argp, args, args_size))
+		ret = -EFAULT;
+
+out:
+	if (dst_file)
+		fput(dst_file);
+	kfree(args);
+	return ret;
+}
+
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -4151,6 +4439,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
+ case BTRFS_IOC_FILE_EXTENT_SAME:
+ return btrfs_ioctl_file_extent_same(file, argp);
}
return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index fa3a5f9..5465bc2 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -305,6 +305,31 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_DEFRAG_RANGE_COMPRESS 1
#define BTRFS_DEFRAG_RANGE_START_IO 2
+#define BTRFS_SAME_DATA_DIFFERS 1
+/* For extent-same ioctl
+ *
+ * One entry per destination file.  The ioctl resets bytes_deduped and
+ * status to 0 before it starts on the destinations and copies the array
+ * back to userspace once they have been processed.
+ */
+struct btrfs_ioctl_same_extent_info {
+ __s64 fd; /* in - destination file (descriptor, looked up
+ * with fget()) */
+ __u64 logical_offset; /* in - start of extent in destination; must
+ * be aligned to the filesystem block size */
+ __u64 bytes_deduped; /* out - total # of bytes we were able
+ * to dedupe from this file */
+ /* status of this dedupe operation:
+ * 0 if dedup succeeds
+ * < 0 for error
+ * == BTRFS_SAME_DATA_DIFFERS if data differs
+ */
+ __s32 status; /* out - see above description */
+ __u32 reserved;
+};
+
+struct btrfs_ioctl_same_args {
+ __u64 logical_offset; /* in - start of extent in source; must be
+ * aligned to the filesystem block size */
+ __u64 length; /* in - length of extent; the ioctl clamps
+ * this to BTRFS_MAX_DEDUPE_LEN (16MiB) */
+ __u16 dest_count; /* in - total elements in info array; header
+ * plus array must fit in one page or the
+ * ioctl fails with -E2BIG */
+ __u16 reserved1;
+ __u32 reserved2;
+ struct btrfs_ioctl_same_extent_info info[0]; /* in/out - dest_count
+ * entries */
+};
+
struct btrfs_ioctl_space_info {
__u64 flags;
__u64 total_bytes;
@@ -510,5 +535,7 @@ struct btrfs_ioctl_send_args {
struct btrfs_ioctl_get_dev_stats)
#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
struct btrfs_ioctl_dev_replace_args)
+#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \
+ struct btrfs_ioctl_same_args)
#endif /* _UAPI_LINUX_BTRFS_H */
--
1.8.1.4
next prev parent reply other threads:[~2013-05-21 18:29 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-05-21 18:28 [PATCH 0/4] btrfs: offline dedupe v1 Mark Fasheh
2013-05-21 18:28 ` [PATCH 1/4] btrfs: abtract out range locking in clone ioctl() Mark Fasheh
2013-05-21 18:28 ` [PATCH 2/4] btrfs_ioctl_clone: Move clone code into it's own function Mark Fasheh
2013-05-21 18:28 ` [PATCH 3/4] btrfs: Introduce extent_read_full_page_nolock() Mark Fasheh
2013-05-21 18:28 ` Mark Fasheh [this message]
2013-05-24 14:05 ` [PATCH 4/4] btrfs: offline dedupe David Sterba
2013-05-24 18:17 ` Mark Fasheh
2013-05-24 19:50 ` Gabriel de Perthuis
2013-05-24 22:38 ` Mark Fasheh
2013-05-24 23:36 ` Gabriel de Perthuis
-- strict thread matches above, loose matches on Subject: below --
2013-08-06 18:42 [PATCH 0/4] btrfs: out-of-band (aka offline) dedupe v4 Mark Fasheh
2013-08-06 18:42 ` [PATCH 4/4] btrfs: offline dedupe Mark Fasheh
2013-08-06 19:11 ` Zach Brown
2013-07-26 16:30 [PATCH 0/4] btrfs: offline dedupe v3 Mark Fasheh
2013-07-26 16:30 ` [PATCH 4/4] btrfs: offline dedupe Mark Fasheh
2013-07-26 22:09 ` Zach Brown
2013-06-11 20:31 [PATCH 0/4] btrfs: offline dedupe v2 Mark Fasheh
2013-06-11 20:31 ` [PATCH 4/4] btrfs: offline dedupe Mark Fasheh
2013-07-15 20:55 ` Zach Brown
2013-07-17 0:14 ` Gabriel de Perthuis
2013-04-16 22:15 [PATCH 0/4] [RFC] " Mark Fasheh
2013-04-16 22:15 ` [PATCH 4/4] " Mark Fasheh
2013-05-06 12:36 ` David Sterba
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1369160908-26195-5-git-send-email-mfasheh@suse.de \
--to=mfasheh@suse.de \
--cc=chris.mason@fusionio.com \
--cc=dsterba@suse.cz \
--cc=g2p.code@gmail.com \
--cc=josef@redhat.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).