我今天用一个简单的脚本来校验所有可用的 hashlib 算法(md5,sha1 .....)中的文件. 有趣的是它适用于小文件,但不适用于大文件。我认为缓冲文件的方式有问题,但是错误消息让我认为这与我执行 hexdigest 的方式有关(我认为)这是我整个脚本的副本,所以随意复制它,使用它并帮助我找出问题所在。校验 250 MB 文件时出现的错误是
“'utf-8' 编解码器无法解码位置 10 中的字节 0xf3:无效的继续字节”
我谷歌它,但找不到任何修复它的东西。另外,如果您看到更好的优化方法,请告诉我。我的主要目标是在 Python 3 中 100% 完成工作。谢谢
import hashlib
import argparse
def hashFile(algorithm = "md5", filepaths=[], blockSize=4096):
algorithmType = getattr(hashlib, algorithm.lower())() #Default: hashlib.md5()
#Open file and extract data in chunks
for path in filepaths:
with open(path) as f:
while True:
dataChunk = f.read(blockSize)
if not dataChunk:
yield algorithmType.hexdigest()
except Exception as e:
print (e)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filepaths', nargs="+", help='Specified the path of the file(s) to hash')
parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
arguments = parser.parse_args()
algo = arguments.algorithm
if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
这是在 Python 2 中工作的代码,我只是把它放在你想使用它的情况下,而不必修改上面的代码。
import hashlib
import argparse
def hashFile(algorithm = "md5", filepaths=[], blockSize=4096):
Hashes a file. In oder to reduce the amount of memory used by the script, it hashes the file in chunks instead of putting
the whole file in memory
algorithmType = hashlib.new(algorithm) #getattr(hashlib, algorithm.lower())() #Default: hashlib.md5()
#Open file and extract data in chunks
for path in filepaths:
with open(path, mode = 'rb') as f:
while True:
dataChunk = f.read(blockSize)
if not dataChunk:
yield algorithmType.hexdigest()
except Exception as e:
print e
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filepaths', nargs="+", help='Specified the path of the file(s) to hash')
parser.add_argument('-a', '--algorithm', action='store', dest='algorithm', default="md5",
help='Specifies what algorithm to use ("md5", "sha1", "sha224", "sha384", "sha512")')
arguments = parser.parse_args()
#Call generator function to yield hash value
algo = arguments.algorithm
if algo.lower() in ("md5", "sha1", "sha224", "sha384", "sha512"):
for hashValue in hashFile(algo, arguments.filepaths):
print hashValue
print "Algorithm {0} is not available in this script".format(algorithm)
if __name__ == "__main__":