#! /usr/bin/perl use strict; use warnings; use BerkeleyDB; use DBI; use Data::Dumper; use Digest::SHA1; use Fcntl; $| = 1; my $env = new BerkeleyDB::Env({ -Home => $ENV{'HOME'} . "/work/bulk", }); my @blocksizes = (qw/16384 8192 4096 1024 512/); #my @blocksizes = (qw/16384/); while (my $device = shift) { my $device_size = `env POSIXLY_CORRECT=1 df $device | tail -1 | awk '{printf "%d", \$1}'`; if ($device_size <= 0) { die "Unable to find device block count through df, make sure you pass /dev/mapper/devicename for logical volumes.\n"; } foreach my $blocksize (sort {$a <=> $b} @blocksizes) { my $device_blocks = ($device_size * 512) / $blocksize; print "Testing \`$device': $blocksize bytes, $device_blocks blocks\n"; my $device_sanitized = $device; $device_sanitized =~ s@/@_@g; $device_sanitized =~ s/[^-a-zA-Z0-9_]//g; my $data; my $percent_done = 0; my $blocks = 0; my %hashes; unlink("$device_sanitized.db"); my $db = tie %hashes, 'BerkeleyDB::Hash', -Filename => "$device_sanitized.db", -Cachesize => 2**26, -Flags => DB_CREATE; my $time_start = time(); sysopen(DEVICE, $device, O_RDONLY) or die; while ((my $bytes_read = sysread(DEVICE, $data, 2**20)) >= $blocksize) { # Read 1M at a time for (my $i = 0; $i < $bytes_read / $blocksize; $i++) { my $substr = substr($data, $i * $blocksize, $blocksize); #printf("Block %d, i = %d, offset = %d\n", # $blocks, $i, $i * $blocksize); my $digest = Digest::SHA1->sha1($substr); $hashes{$digest}++; $blocks++; my $pdone = int(100 * ($blocks / $device_blocks)); if ($pdone != $percent_done) { printf STDERR ("Done: %d%% %d/%d, %d seconds\n", $pdone, $blocks, $device_blocks, time() - $time_start); $percent_done = $pdone; $time_start = time(); } } } close(DEVICE); my $blocks_saved = 0.0; while (my ($key, $value) = each(%hashes)) { if ($value > 1) { $blocks_saved += $value; } } my $saved = $blocks_saved * $blocksize; my $total_size = $blocks * $blocksize; printf("%s: %5d byte blocks, %d blocks\n", $device, $blocksize); printf("\t%d duplicate blocks, %.0f duplicate bytes.\n", $blocks, $blocks_saved, $saved); printf("\t%.2f%% overhead, %.2f%% duplicate.\n", (20.0 / $blocksize) * 100, ($saved / ($total_size + $saved)) * 100); } }