|
|
@@ -2,6 +2,9 @@ import glob
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
import hashlib
|
|
|
|
import hashlib
|
|
|
|
import time
|
|
|
|
import time
|
|
|
|
|
|
|
|
import traceback
|
|
|
|
|
|
|
|
from multiprocessing import Pool
|
|
|
|
|
|
|
|
import queue
|
|
|
|
from PIL import Image, UnidentifiedImageError
|
|
|
|
from PIL import Image, UnidentifiedImageError
|
|
|
|
|
|
|
|
|
|
|
|
from imagehash import average_hash
|
|
|
|
from imagehash import average_hash
|
|
|
@@ -18,7 +21,7 @@ print('ENTER to continue, ctrl+c to cancel.')
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
input()
|
|
|
|
input()
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
print('\nCancelled.')
|
|
|
|
print('Cancelled.')
|
|
|
|
os._exit(0)
|
|
|
|
os._exit(0)
|
|
|
|
|
|
|
|
|
|
|
|
print('Sorting file list by smallest size...')
|
|
|
|
print('Sorting file list by smallest size...')
|
|
|
@@ -36,51 +39,75 @@ for filename in filenames:
|
|
|
|
hasher.update(buf)
|
|
|
|
hasher.update(buf)
|
|
|
|
digest = hasher.hexdigest()
|
|
|
|
digest = hasher.hexdigest()
|
|
|
|
if digest in hashes:
|
|
|
|
if digest in hashes:
|
|
|
|
|
|
|
|
print('Found digest', digest, 'collision for', filename)
|
|
|
|
delete.add(filename)
|
|
|
|
delete.add(filename)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
hashes.add(digest)
|
|
|
|
hashes.add(digest)
|
|
|
|
|
|
|
|
|
|
|
|
print('Found', len(delete), 'duplicates by md5 hash.')
|
|
|
|
print('Found', len(delete), 'duplicates by md5 hash.')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
time.sleep(4)
|
|
|
|
print('Deduplicating by media fingerprint...')
|
|
|
|
print('Deduplicating by media fingerprint...')
|
|
|
|
|
|
|
|
|
|
|
|
def get_image_hash(filename):
    """Return a perceptual fingerprint string for an image file.

    The fingerprint is the name of the file's parent directory, followed by
    the ``average_hash`` of the image and the literal suffix ``'_image'``.
    Because the directory name is part of the fingerprint, two visually
    identical images only produce the same string when their parent
    directories share the same name.

    Args:
        filename: Path of the file to fingerprint.

    Returns:
        The fingerprint string, or None when Pillow cannot identify the file
        as an image (``UnidentifiedImageError``).
    """
    # Despite the usual meaning of "basename", this is the name of the
    # directory that contains the file, not the file's own name.
    parent_dir = os.path.basename(os.path.dirname(filename))
    try:
        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as the hash has been computed, so hashing a large
        # file list does not accumulate open file handles.
        with Image.open(filename) as image:
            return parent_dir + str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
|
|
|
|
|
|
|
|
|
|
|
|
def get_video_hash(filename):
    """Return a fingerprint string for a video file.

    The fingerprint is the name of the file's parent directory, followed by
    the ``hash_hex`` value reported by ``VideoHash`` and the literal suffix
    ``'_video'``.

    Args:
        filename: Path of the file to fingerprint.

    Returns:
        The fingerprint string, or None when ``FFmpegFailedToExtractFrames``
        is raised while processing the file.
    """
    # Name of the directory that contains the file (not the file's own name).
    folder = os.path.basename(os.path.dirname(filename))
    try:
        video = VideoHash(path=filename)
        fingerprint = str(video.hash_hex)
        # Presumably discards the working files VideoHash created for this
        # video -- confirm against the videohash documentation.
        video.delete_storage_path()
    except FFmpegFailedToExtractFrames:
        return None
    return ''.join((folder, fingerprint, '_video'))
|
|
|
|
|
|
|
|
|
|
|
|
count = 0
|
|
|
|
count = 0
|
|
|
|
total = len(filenames)
|
|
|
|
total = len(filenames)
|
|
|
|
|
|
|
|
|
|
|
|
for filename in filenames:
|
|
|
|
def hasher(filename):
    """Compute the media fingerprint for one file.

    Args:
        filename: Path of the file to fingerprint.

    Returns:
        A ``(filename, digest)`` tuple when a fingerprint was produced, or
        None when the file is already in ``delete``, when no fingerprint
        could be computed for it, or when hashing raised an error (the error
        and its traceback are printed to stdout).
    """
    # Files already marked for deletion do not need a fingerprint.
    if filename in delete:
        return None

    print('Hashing file:', filename)

    try:
        # Only image fingerprinting is active here; get_video_hash is not
        # consulted as a fallback.
        digest = get_image_hash(filename)
    except KeyboardInterrupt:
        print('Cancelled.')
        # Hard exit of the current process, skipping normal interpreter
        # cleanup.
        os._exit(0)
    except Exception as e:
        # Best effort: report the failure and skip this file. Catching
        # Exception rather than BaseException lets SystemExit and
        # GeneratorExit propagate instead of being silently swallowed.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        print(traceback.format_exc())
        return None

    if not digest:
        return None
    return (filename, digest)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fingerprint every file through a multiprocessing pool; each entry of
# `results` is either a (filename, digest) tuple or None.
with Pool() as pool:
    results = pool.map(hasher, filenames)

print('Finished hashing.')
print()
print('Checking digests:')
print()

# Walk the fingerprints in order: the first file seen with a given digest is
# remembered in `hashes`, any later file with the same digest is queued in
# `delete`.
for result in results:
    if result:
        print(result)

        filename, digest = result

        if digest not in hashes:
            hashes.add(digest)
        else:
            print('Found digest', digest, 'collision for', filename)
            delete.add(filename)
|
|
|
@@ -88,6 +115,6 @@ for filename in filenames:
|
|
|
|
print()
|
|
|
|
print()
|
|
|
|
print('Found', len(delete), 'total duplicate files.')
|
|
|
|
print('Found', len(delete), 'total duplicate files.')
|
|
|
|
|
|
|
|
|
|
|
|
print('Deleting...')
|
|
|
|
for dupe in sorted(list(delete)):
|
|
|
|
for dupe in delete:
|
|
|
|
print('Deleting:', dupe)
|
|
|
|
os.remove(dupe)
|
|
|
|
os.remove(dupe)
|
|
|
|