I would like to retrieve a list of unique files by content rather than by filename.
That is, if spam.txt and eggs.txt both contain the same contents, I want only one of them to be returned. A very simple approach is to compute a SHA-1 checksum for each file and build a dictionary with the checksum as the unique key, so duplicate contents collapse to a single entry.
#!/usr/bin/env python
# vim: set tabstop=4 shiftwidth=4 autoindent smartindent:
import hashlib, sys
import logging

def _dupdoc(filelist):
    '''
    returns a list of unique files (by content rather than filename)
    that is, if spam.txt and eggs.txt both contained the same contents,
    only one filename will be returned
    '''
    shasums = {}
    for file in filelist:
        try:
            fh = open(file, 'rb')
            sha1 = hashlib.sha1(fh.read()).hexdigest()
            if sha1 not in shasums:
                shasums[sha1] = file
            logging.debug('%s %s' % (file, sha1))
        except IOError as e:
            logging.warning('could not open %s' % (file))
    uniquelist = [file for file in shasums.values()]
    return uniquelist

if __name__ == "__main__":
    '''
    command-line, accept either a list of files on STDIN
    or a single filename argument that contains a list of files
    '''
    filelist = []
    if len(sys.argv) > 1:
        fh = open(sys.argv[1], 'r')
        filelist = fh.readlines()
        fh.close()
    else:
        filelist = sys.stdin.readlines()
    filelist = [file.strip() for file in filelist]
    uniques = _dupdoc(filelist)
    for file in uniques:
        print file
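One caveat: fh.read() pulls each file into memory in a single call, which can be costly for very large files. A minimal sketch of an incremental alternative, assuming a 64 KiB block size (the block size and the helper name _sha1_of_file are arbitrary choices, not part of the program above):

import hashlib

def _sha1_of_file(path, blocksize=65536):
    ''' hash a file in fixed-size blocks so it is never held in memory whole '''
    sha1 = hashlib.sha1()
    fh = open(path, 'rb')
    try:
        block = fh.read(blocksize)
        while block:
            sha1.update(block)
            block = fh.read(blocksize)
    finally:
        fh.close()
    return sha1.hexdigest()

Dropping this in would just mean replacing the hashlib.sha1(fh.read()).hexdigest() line in _dupdoc with a call to the helper.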
The __main__ portion of the program accepts an optional command-line argument; if no argument is given, then the file list is read from STDIN, e.g.,
# find test -type f | dupdoc
test/spam1.txt
test/spam9.txt
#
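If the script is saved as a module (here assumed to be named dupdoc.py), _dupdoc can also be called directly from other Python code rather than through the command line; a small usage sketch, reusing the test files from the example above:

from dupdoc import _dupdoc

# the __main__ guard keeps the command-line code from running on import
filelist = ['test/spam1.txt', 'test/spam9.txt']
for name in _dupdoc(filelist):
    print name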