## Versions
print("Versions", "____________", sys.version.replace("\n","\t"), pd.__version__, sep="\n")
#Versions
#____________
#3.6.2 |Anaconda custom (x86_64)| (default, Jul 20 2017, 13:14:59) #[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
#0.21.1
# How I wrote the object
def write_object(obj, path, compression="bz2", protocol=pickle.HIGHEST_PROTOCOL, **args):
"""
Extensions:
pickle ==> .pkl
gzipped-pickle ==> .pgz
bzip2-pickle ==> .pbz2
"""
if compression is not None:
if compression == "bz2":
f = bz2.BZ2File(path, "wb")
if compression == "gzip":
f = gzip.GzipFile(path, "wb")
else:
f = open(path, "wb")
pickle.dump(obj, f, protocol=protocol, **args)
f.close()
# How I'm trying to read it back in
path = "./Data/objects/counts.tsv.pbz2"
write_object(df, path) #df = pd.DataFrame of ints
f = bz2.open(path, "rb")
df_loaded = pickle.load(f)
f.close()
# ---------------------------------------------------------------------------
# OSError Traceback (most recent call last)
# <ipython-input-13-04a37c035163> in <module>()
# 1 path = "./Data/objects/counts.tsv.pbz2"
# 2 f = bz2.open(path, "rb")
# ----> 3 pickle.load(f)
# ~/anaconda/lib/python3.6/bz2.py in peek(self, n)
# 170 # always returns at least one byte (except at EOF), independent
# 171 # of the value of n
# --> 172 return self._buffer.peek(n)
# 173
# 174 def read(self, size=-1):
# ~/anaconda/lib/python3.6/_compression.py in readinto(self, b)
# 66 def readinto(self, b):
# 67 with memoryview(b) as view, view.cast("B") as byte_view:
# ---> 68 data = self.read(len(byte_view))
# 69 byte_view[:len(data)] = data
# 70 return len(data)
# ~/anaconda/lib/python3.6/_compression.py in read(self, size)
# 101 else:
# 102 rawblock = b""
# --> 103 data = self._decompressor.decompress(rawblock, size)
# 104 if data:
# 105 break
# OSError: Invalid data stream
该文件是一个285.5 MB
制表符分隔的整数表,其中~1.5M columns
和~100 rows
。gzipped
文件大小是42.4 MB
bz2 压缩的腌制pd.DataFrame
是34.2 MB
. 加载到 a 需要 45 分钟,pd.DataFrame
这就是我要序列化的原因。
以这种方式腌制的对象的大小是否有限制?
我问的原因是我以完全相同的方式处理的另一个计数表正在完美打开。文件大小为unprocessed .tsv = 148.9 MB
、 gzipped = 24.8 MB
和bz2-zipped pickled .pbz2 = 19.8 MB
。
唯一类似的问题,但答案无济于事: Python BZ2 IOError: invalid data stream