图像数据集包含大约 70,000 张图像。我试图从中创建 6 个 pickle 文件,因此每个 pickle 转储文件包含大约 11,519 张预处理后的图像。磁盘上的图像文件夹大小约为 2.5 GB,但是生成的每个 pickle 文件大小为 6.45 GB。
总共6.45 * 6 = 38.7 G!!
这是预期的行为,还是我做错了什么?我使用以下脚本生成 pickle 文件。
import os
import pickle
import numpy as np
from PIL import Image
# Directory whose entries are read as image files.
DATADIR = r"path/to/images"
# Maximum number of filenames placed in one chunk (one chunk per pickle file).
n = 11519
# Filenames found in DATADIR; filled in further down the script.
images = list()
def data_preprocessing(samples, img_size, *, normalize=True):
    """Load, resize and stack the images named in *samples*.

    Each filename is joined onto the module-level ``DATADIR``, opened with
    PIL, converted to RGB and resized to ``img_size`` x ``img_size``.

    Args:
        samples: Sequence of image filenames relative to ``DATADIR``.
        img_size: Edge length, in pixels, of the square output images.
        normalize: When true (the default) the result is ``float32`` with
            pixel values divided by 255.  When false the raw ``uint8``
            pixels are returned, which need a quarter of the space both in
            memory and when pickled.

    Returns:
        Array of shape ``(len(samples), img_size, img_size, 3)``.
    """
    img_width, img_height, channels = img_size, img_size, 3
    out_dtype = np.float32 if normalize else np.uint8
    # Fill one preallocated array instead of building a list, stacking it,
    # casting it and dividing it: that chain keeps several full-size copies
    # of the data alive at the same time.
    ds_samples = np.empty(
        (len(samples), img_width, img_height, channels), dtype=out_dtype
    )
    for index, filename in enumerate(samples):
        # The context manager releases the file handle as soon as the pixels
        # have been read.
        with Image.open(os.path.join(DATADIR, filename)) as source:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # resampling filter under its current name.
            resized = source.convert('RGB').resize(
                (img_width, img_height), Image.LANCZOS
            )
        pixels = np.asarray(resized)
        if normalize:
            ds_samples[index] = pixels.astype(np.float32) / 255.0
        else:
            ds_samples[index] = pixels
    return ds_samples
def divide_chunks(samples, n):
    """Yield consecutive slices of *samples*, each holding at most *n* items."""
    # Every multiple of n below len(samples) starts a new slice; the last
    # slice is shorter when len(samples) is not a multiple of n.
    offsets = range(0, len(samples), n)
    yield from (samples[begin:begin + n] for begin in offsets)
# Collect the name of every entry in DATADIR.
images.extend(os.listdir(DATADIR))
# Split the filename list into chunks of at most n names each.
sub_samples = list(divide_chunks(images, n))
# Write one pickle file per chunk.
# NOTE: the arrays are stored uncompressed as float32 (4 bytes per channel
# value), so a full chunk of 11519 images at 224x224x3 takes about 6.9e9
# bytes on disk, far more than the compressed source images.
for chunk, chunk_files in enumerate(sub_samples):
    # Preprocess before opening the output so that a failure while loading
    # images does not leave an empty .pkl file behind.
    batch = data_preprocessing(chunk_files, 224)
    # The context manager closes the file even if pickle.dump raises.
    with open("sample_{}.pkl".format(chunk), 'wb') as outfile:
        pickle.dump(batch, outfile, protocol=4)