Fast duplicate file finder written in Python
#!/usr/bin/env python
"""
Fast duplicate file finder.
Usage: duplicates.py [-h | -hc | -hcf] <folder> [<folder>...]

Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
Modified by jcjveraa to include an option to hardlink any duplicate files.
"""
import os
import sys
import hashlib
from collections import defaultdict
from operator import itemgetter


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()
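

# Strategy: group candidate files by size, then by a hash of their first 1024
# bytes, and only compute a full-file hash for files that still collide.
# This limits full reads (and hashing cost) to likely duplicates.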
def check_for_duplicates(paths, ignore_hardlinks=False, create_hardlinks=False, force=False):
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = dict()
    hashes_by_inode = dict()
    true_duplicates = defaultdict(set)

    if create_hardlinks and not force:
        proceed_command = 'proceed'
        answer = input("WARNING, you have enabled creation of hardlinks for: "
                       + str(paths) +
                       " (relative to " + os.getcwd() +
                       " in case of relative paths).\n\n"
                       "This will result in any found duplicates being hardlinked together. "
                       "The intended use case of this is deduplication of 'read only' backups. "
                       "Continuing may disrupt or even destroy your (file)system if this is not your intent.\n"
                       "Please type '" + proceed_command + "' to continue:\n")
        if answer != proceed_command:
            exit('Exiting execution, no hardlinks were created. '
                 'Run the program again without the -hc option to just get a list of duplicates.')

    for path in paths:
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                files_by_size[file_size].append(full_path)

    # For all files with the same size, hash their first 1024 bytes
    i = 0
    for file_size, files in files_by_size.items():
        i += 1
        progress = 100.0 * i / len(files_by_size)
        # print('Small hash progress...' + str(progress))
        if len(files) < 2:
            continue  # this file size is unique, no need to spend cpu cycles on it

        # Check whether all files in this group are already hardlinked to each other
        if ignore_hardlinks:
            try:
                if len(files) == os.stat(files[0]).st_nlink:
                    continue  # all files are hardlinked together
            except OSError:
                # not accessible (permissions, etc) - pass on
                continue

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was first seen
                continue
            files_by_small_hash[(file_size, small_hash)].append(filename)

    # For all files with the same hash on the first 1024 bytes, hash the full
    # file - collisions will be duplicates
    i = 0
    for files in files_by_small_hash.values():
        i += 1
        progress = 100.0 * i / len(files_by_small_hash)
        # print('Large hash progress...' + str(progress))
        if len(files) < 2:
            # the hash of the first 1k bytes is unique -> no duplicates here
            continue

        if ignore_hardlinks:
            try:
                if len(files) == os.stat(files[0]).st_nlink:
                    continue  # all files are hardlinked together
            except OSError:
                # not accessible (permissions, etc) - pass on
                continue

        for filename in files:
            try:
                # don't calculate hashes twice for the same (hardlinked) file.
                # TODO maybe this needs to be surrounded by an if statement for
                # filesystems that do not support st_ino (if any)?
                file_inode = os.stat(filename).st_ino
                if file_inode in hashes_by_inode:
                    full_hash = hashes_by_inode[file_inode]
                else:
                    full_hash = get_hash(filename, first_chunk_only=False)
                    hashes_by_inode[file_inode] = full_hash
            except OSError:
                # the file may have become inaccessible since it was first seen
                continue

            if full_hash in files_by_full_hash:
                duplicate = files_by_full_hash[full_hash]
                # add files to the hardlink list here to ensure only files with duplicates are added
                true_duplicates[full_hash].add(filename)
                true_duplicates[full_hash].add(duplicate)
            else:
                files_by_full_hash[full_hash] = filename

    for full_hash, files in true_duplicates.items():
        inodes = set(os.stat(filename).st_ino for filename in files)
        print('Duplicates found, hash is ' + full_hash.hex() +
              ' on ' + str(len(inodes)) + ' different locations:')
        for filename in files:
            print(' - ' + str(filename) + ', inode ' +
                  str(os.stat(filename).st_ino))

    if not create_hardlinks:
        exit(0)
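
    # Deduplicate by replacing each duplicate with a hardlink to the most
    # recently modified copy in its group. Each file is first renamed to
    # '<name>.recover' so it can be restored if os.link fails; the temporary
    # copy is removed only once the new hardlink is in place.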
    print('All duplicates are now being hardlinked...')
    for _, files in true_duplicates.items():
        mtimes = [(filename, os.path.getmtime(filename)) for filename in files]
        # keep the file with the most recent 'mtime' as the source (this may not do anything)
        most_recently_modified_file = max(mtimes, key=itemgetter(1))[0]
        files.remove(most_recently_modified_file)
        for filename in files:
            temp_file_name = filename + '.recover'
            os.rename(filename, temp_file_name)
            os.link(most_recently_modified_file, filename)
            os.unlink(temp_file_name)
    print('Hardlinking completed...')


if __name__ == "__main__":
    if sys.argv[1:]:
        if sys.argv[1] == '-h':
            check_for_duplicates(sys.argv[2:], ignore_hardlinks=True)
        elif sys.argv[1] == '-hc':
            check_for_duplicates(
                sys.argv[2:], ignore_hardlinks=True, create_hardlinks=True)
        elif sys.argv[1] == '-hcf':
            check_for_duplicates(
                sys.argv[2:], ignore_hardlinks=True, create_hardlinks=True, force=True)
        else:
            check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s [-h | -hc | -hcf] <folder> [<folder>...]" % sys.argv[0])
        print("\t-h\t ignore hardlinks (do not count hardlinked files as duplicates).")
        print("\t-hc\t create hardlinks (hardlink duplicates together, for deduplication - CAN BE DANGEROUS!).")
        print("\t-hcf\t as -hc but without asking for confirmation.")