我最近也遇到了类似的问题,为此编写了几个函数,用于把 dict 的内容保存到 PyTables 文件的组(group)中,并能将其重新加载为 dict。
这些函数会递归地处理嵌套的字典和组结构;对于 PyTables 本身不支持的类型,则先用 pickle 将对象序列化,再以字符串数组的形式存储。它并不完美,但至少像 numpy 数组这样的数据能够被高效地存储。此外还包含一项检查,以避免在把组的内容读回字典时,无意中将巨大的结构加载到内存中。
import cPickle
import warnings

import tables
def dict2group(f, parent, groupname, dictin, force=False, recursive=True):
    """
    Store the contents of a dict as a new group in a PyTables HDF5 file.

    Every non-dict value is written with ``f.create_array``, so it must have
    a type and shape that PyTables can store as an Array. ``None`` values
    are written as the sentinel string ``'_None'`` so that they can be
    recognised when the group is read back.

    Arguments:
        f          -- open, writable PyTables file handle.
        parent     -- group node under which the new group is created.
        groupname  -- name of the new group.
        dictin     -- dict whose items become the children of the new group;
                      its keys are used as node names.
        force      -- if true, an existing child of `parent` called
                      `groupname` is removed (recursively) and replaced.
        recursive  -- if true (default), values that are themselves dicts
                      are written as nested groups; otherwise they are
                      skipped.

    Returns the newly created group node.

    Raises ``tables.NodeError`` (re-raised from ``f.create_group``) when the
    group cannot be created and `force` is false.
    """
    try:
        g = f.create_group(parent, groupname)
    except tables.NodeError:
        if not force:
            # Bare raise keeps the original traceback intact.
            raise
        # Address the stale node as (parent, name) instead of concatenating a
        # path string: concatenation produced '//name' when the parent was
        # the root group.
        f.remove_node(parent, groupname, recursive=True)
        g = f.create_group(parent, groupname)

    for key, item in dictin.items():
        if isinstance(item, dict):
            # Nested dicts become sub-groups, or are skipped entirely when
            # recursion is switched off.
            if recursive:
                dict2group(f, g, key, item, recursive=True)
            continue
        if item is None:
            item = '_None'
        f.create_array(g, key, item)
    return g
def group2dict(f, g, recursive=True, warn=True, warn_if_bigger_than_nbytes=100E6):
    """
    Read the children of a PyTables group into a Python dictionary.

    The node names become the dictionary keys. Leaf nodes are loaded with
    their ``read()`` method; a leaf holding the sentinel string ``'_None'``
    is returned as ``None``.

    Arguments:
        f          -- the PyTables file handle (only passed along to the
                      recursive calls).
        g          -- the group node to traverse.
        recursive  -- if true (default), child groups are traversed as well
                      and returned as nested dictionaries; otherwise child
                      groups are skipped.
        warn       -- if true (default), the user is asked on the console to
                      confirm before any single leaf larger than the
                      threshold is loaded; declined leaves are left out of
                      the result. If false, everything is loaded without
                      asking.
        warn_if_bigger_than_nbytes
                   -- size threshold in bytes for the confirmation prompt
                      (default 100E6, i.e. 100MB).

    Returns a dict mapping child node names to their loaded contents.

    A child that raises ``tables.NoSuchNodeError`` while being processed is
    skipped and reported through ``warnings.warn``.
    """

    def memtest(child, threshold=warn_if_bigger_than_nbytes):
        # Decide whether `child` may be loaded; returns True to load it.
        if not warn:
            return True
        mem = child.size_in_memory
        if mem <= threshold:
            return True
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('[!] "%s" is %iMB in size [!]' % (child._v_pathname, mem / 1E6))
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        confirm = ask('Load it anyway? [y/N] >>')
        if confirm.lower() == 'y':
            return True
        # Report the leaf that is being skipped, not its parent group.
        print('Skipping item "%s"...' % child._v_pathname)
        return False

    outdict = {}
    for child in g:
        try:
            if isinstance(child, tables.group.Group):
                if not recursive:
                    continue
                # Hand the caller's settings down so sub-groups do not
                # silently fall back to the defaults.
                item = group2dict(
                    f, child, recursive=True, warn=warn,
                    warn_if_bigger_than_nbytes=warn_if_bigger_than_nbytes)
            else:
                if not memtest(child):
                    continue
                item = child.read()
                # NOTE(review): on Python 3 the stored sentinel presumably
                # comes back as bytes rather than str -- accept both forms.
                is_text_none = isinstance(item, str) and item == '_None'
                is_bytes_none = isinstance(item, bytes) and item == b'_None'
                if is_text_none or is_bytes_none:
                    item = None
            outdict[child._v_name] = item
        except tables.NoSuchNodeError:
            warnings.warn('No such node: "%s", skipping...' % repr(child))
    return outdict
另外值得一提的还有 joblib.dump 和 joblib.load:除了 Python 2/3 交叉兼容性之外,它们能满足你的所有要求。在底层,它们对 numpy 数组使用 np.save,对其他所有内容则使用 cPickle。