I worked today on a simple script to checksum files with any of the available hashlib algorithms (md5, sha1, ...). I wrote and debugged it with Python 2, but when I ported it to Python 3 it just wouldn't work. The funny thing is that it works for small files, but not for big ones. I thought there was a problem with the way I was buffering the file, but the error message makes me think it is related to the way I am doing the hexdigest (I think). Here is a copy of my entire script, so feel free to copy it, use it, and help me figure out what the problem is. The error I get when checksumming a 250 MB file is:
"'utf-8' codec can't decode byte 0xf3 in position 10: invalid continuation byte"
I googled it, but can't find anything that fixes it. Also, if you see better ways to optimize it, please let me know. My main goal is to make it work 100% in Python 3. Thanks.
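For what it's worth, the error message itself is easy to reproduce with any byte sequence that is not valid UTF-8; this snippet is just an illustration and is not taken from my actual file:

# Illustration only (not from my file): decoding bytes that are not
# valid UTF-8 raises the same kind of UnicodeDecodeError.
try:
    b'\xf3abc'.decode('utf-8')
except UnicodeDecodeError as err:
    print(err)  # 'utf-8' codec can't decode byte 0xf3 in position 0: invalid continuation byte

And this is the Python 3 script itself: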
#!/usr/local/bin/python33

import hashlib
import argparse

def hashFile(algorithm="md5", filepaths=[], blockSize=4096):
    algorithmType = getattr(hashlib, algorithm.lower())()  # Default: hashlib.md5()
    # Open file and extract data in chunks
    for path in filepaths:
        try:
            with open(path) as f:
                while True:
                    dataChunk = f.read(blockSize)
                    if not dataChunk:
                        break
                    algorithmType.update(dataChunk.encode())
                yield algorithmType.hexdigest()
        except Exception as e:
            print(e)

def main():
    # DEFINE ARGUMENTS
    parser = argparse.ArgumentParser()
    parser.add_argument('filepaths', nargs="+", help='Specifies the path of the file(s) to hash')
    parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
                        help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
    arguments = parser.parse_args()

    algo = arguments.algorithm
    if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
        # Call the generator function to yield the hash value(s)
        for hashValue in hashFile(algo, arguments.filepaths):
            print(hashValue)
    else:
        print("Algorithm {0} is not available in this script".format(algo))

if __name__ == "__main__":
    main()
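(The reason for the .encode() call above: as far as I understand, in Python 3 hashlib's update() only accepts bytes, not str. A quick check of that, independent of my script:)

import hashlib

h = hashlib.md5()
h.update(b"hello")        # bytes are accepted
print(h.hexdigest())      # 5d41402abc4b2a76b9719d911017c592
# h.update("hello")       # passing a plain str raises TypeError in Python 3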
Here is the code that works in Python 2; I'm including it in case you want to use it without having to modify the one above.
#!/usr/bin/python

import hashlib
import argparse

def hashFile(algorithm="md5", filepaths=[], blockSize=4096):
    '''
    Hashes a file. In order to reduce the amount of memory used by the script, it hashes
    the file in chunks instead of putting the whole file in memory.
    '''
    algorithmType = hashlib.new(algorithm)  # getattr(hashlib, algorithm.lower())()  # Default: hashlib.md5()
    # Open file and extract data in chunks
    for path in filepaths:
        try:
            with open(path, mode='rb') as f:
                while True:
                    dataChunk = f.read(blockSize)
                    if not dataChunk:
                        break
                    algorithmType.update(dataChunk)
                yield algorithmType.hexdigest()
        except Exception as e:
            print e

def main():
    # DEFINE ARGUMENTS
    parser = argparse.ArgumentParser()
    parser.add_argument('filepaths', nargs="+", help='Specifies the path of the file(s) to hash')
    parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
                        help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
    arguments = parser.parse_args()

    # Call the generator function to yield the hash value(s)
    algo = arguments.algorithm
    if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
        for hashValue in hashFile(algo, arguments.filepaths):
            print hashValue
    else:
        print "Algorithm {0} is not available in this script".format(algo)

if __name__ == "__main__":
    main()
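And in case it helps with testing, this is how I expect the arguments to be parsed; the script name and file names below are just placeholders for illustration:

import argparse

# Same argument definitions as in the script above.
parser = argparse.ArgumentParser()
parser.add_argument('filepaths', nargs="+")
parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5")

# Simulates running: python hashfile.py -a sha1 file1.bin file2.bin
args = parser.parse_args(['-a', 'sha1', 'file1.bin', 'file2.bin'])
print(args.algorithm)   # sha1
print(args.filepaths)   # ['file1.bin', 'file2.bin']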