From: Lakshmipathi Ganapathi <lakshmipathi.ganapathi@collabora.co.uk>
To: linux-btrfs@vger.kernel.org, lakshmipathi.g@giis.co.in
Subject: [PATCH] btrfs-progs: dduper - BTRFS offline deduplication tool
Date: Fri, 24 Aug 2018 09:54:40 +0530
Message-ID: <20180824042440.GA7793@giis.co.in>
dduper is an offline dedupe tool. Instead of reading whole file blocks and
computing checksums, it fetches the existing checksums from the BTRFS csum
tree, which makes deduplication much faster. The tool relies on the output
of the 'btrfs inspect-internal dump-csum' command.
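
Example usage (device and paths below are placeholders):

    python dduper --device /dev/sda3 --files /mnt/f1 /mnt/f2
    python dduper --device /dev/sda3 --dir /mnt/data --recurse --dry-run
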
Signed-off-by: Lakshmipathi.G <lakshmipathi.ganapathi@collabora.co.uk>
---
dduper | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 310 insertions(+)
create mode 100644 dduper
diff --git a/dduper b/dduper
new file mode 100644
index 0000000..2170b11
--- /dev/null
+++ b/dduper
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+
+""" dduper - BTRFS Dedupe tool.
+
+This is an offline dedupe tool. Instead of reading whole file blocks and
+computing checksums, it fetches the existing checksums from the BTRFS csum
+tree. This hugely improves performance.
+
+Authors: Lakshmipathi.G <lakshmipathi.ganapathi@collabora.co.uk>
+"""
+import argparse
+import errno
+import hashlib
+import numpy as np
+import math
+import os
+import pdb
+import struct
+import subprocess
+import sys
+
+from collections import OrderedDict
+from fcntl import ioctl
+from itertools import combinations
+from itertools import izip_longest
+from stat import *
+# 4KB block size
+blk_size = 4
+# number of csums on a single dump-csum output row - currently 8
+no_of_chunks = 0
+FICLONERANGE = 0x4020940d
+
+device_name = None
+skip = 0
+chunk_sz = 0
+run_len = 0
+ele_sz = 0
+
+# Already deduped files
+processed_files = []
+
+
+# From https://stackoverflow.com/questions/434287
+def grouper(iterable, n, fillvalue=None):
+    args = [iter(iterable)] * n
+    return izip_longest(*args, fillvalue=fillvalue)
+
+
+def get_ele_size():
+
+    global no_of_chunks, run_len
+    if chunk_sz <= 0 or chunk_sz % 32 != 0:
+        print "Ensure chunk size is a multiple of 32KB (32, 64, 128, etc.)"
+        sys.exit(-1)
+    no_of_chunks = chunk_sz / blk_size
+    ele_sz = no_of_chunks / 8
+    run_len = no_of_chunks * blk_size * 1024
+    return ele_sz
+
+
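+# Note on chunking: dump-csum output is consumed line by line; with the default
+# 32KB chunk size each output line (8 csums x 4KB blocks) is hashed on its own,
+# while larger -c values make grouper() bundle ele_sz consecutive lines per hash.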
+def get_hashes(out1):
+ """
+ For each list item compute its hash and store it with offset as its key.
+ """
+ global ele_sz
+
+ if ele_sz == 1:
+ od = OrderedDict()
+ for idx, ele in enumerate(out1):
+ v = []
+ k = hashlib.md5(str(ele)).hexdigest()
+ v.append(idx)
+ if k in od:
+ print "Collison with: "+str(k) + "at offset: "+str(v)
+ od[k] = v
+ else:
+ od = OrderedDict()
+ for idx, ele in enumerate(grouper(out1, ele_sz, 'x')):
+ v = []
+ k = hashlib.md5(str(ele)).hexdigest()
+ v.append(idx)
+ if k in od:
+ print "Collison with: "+str(k) + "at offset: "+str(v)
+ od[k] = v
+
+ return od
+
+
+def ioctl_ficlonerange(dst_fd, s):
+
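+    # 's' is a packed struct file_clone_range: (s64 src_fd, u64 src_offset,
+    # u64 src_length, u64 dest_offset), matching the "qQQQ" layout built in do_dedupe().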
+    try:
+        ioctl(dst_fd, FICLONERANGE, s)
+    except Exception as e:
+        print "error({0})".format(e.errno)
+
+
+def cmp_files(file1, file2):
+
+    md1 = subprocess.Popen(['md5sum', file1], stdout=subprocess.PIPE,
+                           close_fds=True).stdout.read().split(" ")[0]
+    md2 = subprocess.Popen(['md5sum', file2], stdout=subprocess.PIPE,
+                           close_fds=True).stdout.read().split(" ")[0]
+    if md1 == md2:
+        return 0
+    else:
+        return 1
+
+
+def do_dedupe(src_file, dst_file, dry_run):
+
+    bkup_file = dst_file + ".__superduper"
+    src_fd = os.open(src_file, os.O_RDONLY)
+    dst_fd = os.open(dst_file, os.O_WRONLY)
+    perfect_match = 0
+
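+    # Fetch the per-4KB checksums of both files straight from the csum tree;
+    # identical dump-csum output means the file contents match (up to csum collisions).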
+    out1 = subprocess.Popen(['btrfs', 'inspect-internal', 'dump-csum', src_file, device_name],
+                            stdout=subprocess.PIPE, close_fds=True).stdout.readlines()
+    out2 = subprocess.Popen(['btrfs', 'inspect-internal', 'dump-csum', dst_file, device_name],
+                            stdout=subprocess.PIPE, close_fds=True).stdout.readlines()
+    # todo: on a perfect match, remove dst_file from further operations
+    if out1 == out2:
+        print "Perfect match : ", src_file, dst_file
+        perfect_match = 1
+
+    src_dict = get_hashes(out1)
+    dst_dict = get_hashes(out2)
+    total_entry = len(src_dict) - 1  # index of the final (possibly partial) chunk
+    file_size = os.path.getsize(src_file)
+
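+    # Chunk hashes present in both files are dedupe candidates; src_dict/dst_dict
+    # map each chunk hash to its offset (chunk index) within the respective file.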
+    np1 = np.array([v for v in src_dict.keys()])
+    np2 = np.array([v for v in dst_dict.keys()])
+    matched_keys = np.intersect1d(np1, np2)
+    unmatched_keys = np.setdiff1d(np1, np2)
+
+    if dry_run == 0:
+        # todo: clear dicts/arrays/lists once they are no longer used
+        # todo: handle same content within a single file
+
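+        # Unless --skip is given, a reflink copy of dst_file is kept as a backup
+        # so the deduped result can be verified against it afterwards.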
+        if matched_keys is not None:
+            if skip == 0:
+                bkup2 = subprocess.Popen(['cp', '--reflink=always', dst_file, bkup_file], stdout=subprocess.PIPE)
+            print "*" * 24
+            # print "matched regions"
+            for location in matched_keys:
+                entry = src_dict[location][0]
+                src_len = no_of_chunks * blk_size * 1024
+                src_offset = src_dict[location][0] * src_len
+
+                multi_dst_offsets = dst_dict[location]  # list
+                for offset in multi_dst_offsets:
+                    dst_offset = offset * src_len
+
+                    if entry == total_entry:  # the final chunk may be shorter than run_len
+                        src_len = file_size - src_offset
+                    # print "matching chunk : src offset:" + str(src_offset) + " src_len=" + str(src_len) + " dest_off=" + str(dst_offset)
+                    s = struct.pack("qQQQ", src_fd, src_offset, src_len, dst_offset)
+                    ioctl_ficlonerange(dst_fd, s)
+
+ print "Dedupe completed for " + src_file + ":" + dst_file
+ # Verify original unmodified file and newly deduped file both point to same contents
+ if skip == 0:
+ ret = cmp_files(dst_file, bkup_file)
+ if ret == 0:
+ print "Dedupe validation successful " + src_file + ":" + dst_file
+ # Removing temporary backup file path
+ os.unlink(bkup_file)
+ else:
+ print "WARNING: Dedupe for " + dst_file + " Resulted in corruption" + \
+ " Backup file path " + bkup_file
+
+    # Close open fds
+    os.close(src_fd)
+    os.close(dst_fd)
+
+    print "Summary"
+    print "blk_size : %d chunksize : %d" % (blk_size, chunk_sz)
+    print src_file + " has " + str(len(src_dict)) + " chunks"
+    print dst_file + " has " + str(len(dst_dict)) + " chunks"
+    print "Matched chunks : " + str(len(matched_keys))
+    print "Unmatched chunks: " + str(len(unmatched_keys))
+    return perfect_match
+
+
+def validate_files(src_file, dst_file, processed_files):
+    global run_len
+    if src_file in processed_files:
+        return False
+    if dst_file in processed_files:
+        return False
+    src_stat = os.stat(src_file)
+    dst_stat = os.stat(dst_file)
+    # Verify both are distinct regular files that are large enough to dedupe
+    if (S_ISREG(src_stat.st_mode) and S_ISREG(dst_stat.st_mode) and
+            (src_stat.st_ino != dst_stat.st_ino) and
+            (src_stat.st_size >= 4096) and
+            (dst_stat.st_size >= 4096) and
+            (src_stat.st_size >= run_len) and
+            (dst_stat.st_size >= run_len)):
+        return True
+    print "Skipped", src_file, dst_file, "not unique regular files or " + \
+          "file size < 4KB or size < " + str(run_len)
+    return False
+
+
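+# With more than two files, every pair from itertools.combinations() is tried;
+# a file that already matched another file perfectly goes into processed_files
+# and is skipped in later pairs.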
+def dedupe_files(file_list, dry_run):
+    global processed_files
+    ret = 0
+    if len(file_list) == 2:
+        src_file = file_list[0]
+        dst_file = file_list[1]
+        if validate_files(src_file, dst_file, processed_files) is True:
+            ret = do_dedupe(src_file, dst_file, dry_run)
+    elif len(file_list) > 2:
+        comb = combinations(file_list, 2)
+        for f in comb:
+            src_file = f[0]
+            dst_file = f[1]
+            ret = 0
+            if validate_files(src_file, dst_file, processed_files) is True:
+                ret = do_dedupe(src_file, dst_file, dry_run)
+            if ret == 1:
+                # Perfect match found - do not reuse this file in later pairs.
+                processed_files.append(dst_file)
+    else:
+        print "Need at least two files to dedupe"
+    return
+
+
+def validate_file(filename):
+    global run_len
+    file_stat = os.stat(filename)
+    # Verify it is a regular file large enough to dedupe
+    if (S_ISREG(file_stat.st_mode) and (file_stat.st_size >= 4096) and
+            (file_stat.st_size >= run_len)):
+        return True
+    else:
+        print "Skipped", filename, "not a regular file or " + \
+              "file size < 4KB or size < " + str(run_len)
+        return False
+
+
+def dedupe_dir(dir_path, dry_run, recurse):
+    file_list = []
+    if recurse == 1:
+        for path, dirs, files in os.walk(dir_path):
+            for filename in files:
+                fn = os.path.join(path, filename)
+                if validate_file(fn) is True:
+                    file_list.append(fn)
+    else:
+        for fi in os.listdir(dir_path):
+            if os.path.isfile(os.path.join(dir_path, fi)):
+                fn = os.path.join(dir_path, fi)
+                if validate_file(fn) is True:
+                    file_list.append(fn)
+    dedupe_files(file_list, dry_run)
+
+
+def main(results):
+
+    if results.file_list is not None:
+        dedupe_files(results.file_list, results.dry_run)
+
+    if results.dir_path is not None:
+        dedupe_dir(results.dir_path, results.dry_run, results.recurse)
+
+    return
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-p', '--device', action='store', dest='device_name',
+                        type=str, help='Device with the BTRFS partition (ex: /dev/sda3)',
+                        required=True)
+
+    single = parser.add_mutually_exclusive_group()
+
+    single.add_argument('-d', '--dir', action='store', dest='dir_path',
+                        type=str, help='Dedupe the given directory', required=False)
+
+    single.add_argument('-f', '--files', action='store', dest='file_list',
+                        nargs='+', help='Dedupe the given list of files', type=str, required=False)
+
+    parser.add_argument('-r', '--recurse', action='store_true', dest='recurse',
+                        help='Parse the directory recursively (used along with -d)')
+
+    parser.add_argument('-D', '--dry-run', action='store_true', dest='dry_run',
+                        help='Only show a summary of the dedupe details')
+
+    parser.add_argument('-s', '--skip', action='store_true', dest='skip',
+                        help='Skip the backup/validation step')
+
+    parser.add_argument('-c', '--chunk-size', action='store', dest='chunk_sz',
+                        type=int, default=32, help='Dedupe chunk size in KB', required=False)
+
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01',
+                        help='Show version info')
+
+    results = parser.parse_args()
+
+    if not (results.dir_path or results.file_list):
+        parser.error('No action requested, add --files or --dir')
+
+    device_name = results.device_name
+    skip = results.skip
+    chunk_sz = results.chunk_sz
+    ele_sz = get_ele_size()
+
+    main(results)
--
2.7.4