I would like to retrieve a list of unique files by content rather than by filename.
That is, if spam.txt and eggs.txt both contain the same contents, I want only one of them to be returned. A very simple approach is to compute a SHA-1 checksum for each file and build a dictionary with the checksum as the unique key.
#!/usr/bin/env python
# vim: set tabstop=4 shiftwidth=4 autoindent smartindent:

import hashlib, sys
import logging

def _dupdoc(filelist):
    '''
    returns a list of unique files (by content rather than filename)
    that is, if spam.txt and eggs.txt both contained the same contents,
    only one filename will be returned
    '''
    shasums = {}
    for file in filelist:
        try:
            fh = open(file, 'rb')
            sha1 = hashlib.sha1(fh.read()).hexdigest()
            if sha1 not in shasums:
                shasums[sha1] = file
            logging.debug('%s %s' % (file, sha1))
        except IOError as e:
            logging.warning('could not open %s' % (file))
    uniquelist = [file for file in shasums.values()]
    return uniquelist

if __name__ == "__main__":
    '''
    command-line, accept either a list of files on STDIN
    or a single filename argument that contains a list of files
    '''
    filelist = []
    if len(sys.argv) > 1:
        fh = open(sys.argv[1], 'r')
        filelist = fh.readlines()
        fh.close()
    else:
        filelist = sys.stdin.readlines()
    filelist = [file.strip() for file in filelist]
    uniques = _dupdoc(filelist)
    for file in uniques:
        print file
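
Note that _dupdoc reads each file into memory in one go with fh.read(); for very large files it may be preferable to feed the hash in fixed-size chunks instead. A minimal sketch of that variant, assuming a 64 KiB chunk size and an illustrative helper name sha1_of_file:

import hashlib

def sha1_of_file(path, chunksize=65536):
    '''compute the SHA-1 of a file without reading it all into memory'''
    sha1 = hashlib.sha1()
    with open(path, 'rb') as fh:
        while True:
            chunk = fh.read(chunksize)
            if not chunk:
                break
            sha1.update(chunk)
    return sha1.hexdigest()

Such a helper could then stand in for the hashlib.sha1(fh.read()).hexdigest() call inside the loop.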
The command-line __main__ portion of the program accepts an optional filename argument; if no argument is specified, the file list is read from STDIN, e.g.,
# find test -type f | dupdoc
test/spam1.txt
test/spam9.txt
#
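
The filename-argument mode works the same way; assuming the list is first saved to a file (filelist.txt is only an illustrative name), the output should be identical:

# find test -type f > filelist.txt
# dupdoc filelist.txt
test/spam1.txt
test/spam9.txt
#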