我有一个目录,其多个子目录中共包含 170,000 多个 pickle 文件,这些文件最初是用 protocol=0 格式序列化的。这种格式在时间和空间上都不够高效。
我编写了一个脚本,用 cPickle(protocol=2)重新序列化该文件夹中的每个文件,但奇怪的是,脚本在处理某个特定文件(第 95,000 个文件)时抛出了异常。起初我以为这个 pickle 文件已损坏,但当我在 IPython 命令行中加载同一个 pickle 文件时,它却能正常加载。
所以,我完全不明白为什么会出现这种情况。下面是我的脚本,感谢您的帮助:
import os
import cPickle
import numpy
import time
import re
from progressbar import ProgressBar
# Root folder that is searched recursively for the pickle files to convert.
inpath = '/path/to/folder'
def list_files(dir):
    """Return the paths of all files found under ``dir``, recursively.

    Each path is built as ``<directory> + "/" + <filename>``, where
    ``<directory>`` is the directory path reported by ``os.walk``.

    Args:
        dir: Root directory to search.

    Returns:
        A list of file paths in ``os.walk`` traversal order. The list is
        empty when no files are found.
    """
    # One walk already yields every directory together with its files.
    # Starting a second os.walk() per subdirectory listed the tree twice and
    # raised StopIteration if a directory became unreadable between passes.
    paths = []
    for root, _subdirs, names in os.walk(dir):
        paths.extend(root + "/" + name for name in names)
    return paths
# Re-pickle every file found under `inpath` with pickle protocol 2, writing
# each result to the mirrored location under the new output root.
# The print calls take one parenthesised argument so they behave the same
# under the Python 2 print statement and the Python 3 print function.
infileList = list_files(inpath)
print("Total number of files found: %d" % len(infileList))
print("\n\n")
progress = ProgressBar()
outfilename = " "
print("Processing pickle files. Pls wait...")
t0 = time.time()
filecount = 0  # number of files successfully re-pickled so far
for infilename in progress(infileList):
    try:
        # Close the input handle deterministically instead of relying on
        # garbage collection across 170k+ files.
        # NOTE(review): unpickling executes code embedded in the file; only
        # run this over pickle files from a trusted source.
        with open(infilename, "rb") as src:
            arr = cPickle.load(src)
        # Literal prefix swap; a regex is unnecessary and would misread any
        # metacharacters that appear in a real path.
        outfilename = infilename.replace('/initial/path/', '/new/path/')
        outdir = os.path.dirname(outfilename)
        # Guard against an empty dirname, which os.makedirs rejects.
        if outdir and not os.path.exists(outdir):
            os.makedirs(outdir)
        with open(outfilename, "wb") as f:
            cPickle.dump(arr, f, protocol=2)
        filecount = filecount + 1
    except Exception as e:
        print("\n" + str(filecount))
        # Report the INPUT path. `outfilename` is only assigned after a
        # successful load, so on a load failure it still names the previous
        # file and points at the wrong pickle.
        print("\nError occured while processing file: " + infilename)
        # Show the actual cause instead of discarding the exception.
        print("Reason: %s: %s" % (type(e).__name__, e))
        tx = time.time()
        print("\n Time elapsed: %.2f" % (tx - t0))
t1 = time.time()
total = t1 - t0
print("Files repickled with protocol=2.\nRepickling execution time: %.2f sec" % total)