#! /usr/bin/python2 # # Locate 4k fragments of a subject file in one or more other files or # devices. Only reports two or more consecutive matches. # # Usage: # findHash.py /path/to/subject/file /dev/sdx|/path/to/image/file [/dev/sdy ...] import hashlib, sys, datetime # Read the known file 4k at a time, building a dictionary of # md5 hashes vs. offset. Use a large buffer for speed. # Drops any partial block at the end of the file. d = {} pos = long(0) f = open(sys.argv[1], 'r', 1<<20) b = f.read(4096) while len(b)==4096: md5 = hashlib.md5() md5.update(b) h = md5.digest() hlist = d.get(h) if not hlist: hlist = [] d[h] = hlist # print "New hash %s at %8.8x" % (h.encode('hex'), pos) hlist.append(pos) pos += 4096 b = f.read(4096) f.close() print "%d Unique hashes in %s" % (len(d), sys.argv[1]) def checkAndPrint(match): if match[2]>4096: print "%20s @ %12.12x:%12.12x ~= %8.8x:%8.8x" % (fname, match[1], match[1]+match[2]-1, match[0], match[0]+match[2]-1) # Read the candidate files/devices, looking for possible matches. Match # entries are vectors of known file offset, candidate file offset, and # length. for fname in sys.argv[2:]: print "\nSearching for pieces of %s in %s:..." % (sys.argv[1], fname) pos = long(0) f = open(fname, 'r', 1<<24) matches = [] b = f.read(4096) lastts = None while len(b)==4096: if not (pos & 0x7ffffff): ts = datetime.datetime.now() if lastts: print "@ %12.12x %.1fMB/s \r" % (pos, 128.0/((ts-lastts).total_seconds())), else: print "@ %12.12x...\r" % pos, sys.stdout.flush() lastts = ts md5 = hashlib.md5() md5.update(b) h = md5.digest() if h in d: i = 0 while i