1

图像数据集包含大约 70,000 张图像。我试图从中创建 6 个 pickle 文件,因此每个 pickle 转储文件包含大约 11519 张预处理后的图像。磁盘上的图像文件夹大小约为 2.5G,但是生成的每个 pickle 文件大小为 6.45G。

总共6.45 * 6 = 38.7 G!!

这是预期的行为,还是我做错了什么?我使用以下脚本生成 pickle 文件。

import os
import pickle
import numpy as np
from PIL import Image

# Maximum number of filenames placed in each chunk (one pickle file per chunk).
n = 11519
# Filled further down with the entry names returned by os.listdir(DATADIR).
images = []
# Directory that is listed for input files and joined with each filename on load.
DATADIR = r"path/to/images"

def data_preprocessing(samples, img_size):
    """Load, resize and normalise a batch of images into one float32 array.

    Each filename in ``samples`` is opened relative to the module-level
    ``DATADIR``, converted to RGB, resized to ``img_size`` x ``img_size``
    and scaled from the 0-255 byte range to 0.0-1.0.

    Args:
        samples: Sequence of image filenames located inside ``DATADIR``.
        img_size: Edge length, in pixels, of the square output images.

    Returns:
        A ``float32`` array of shape
        ``(len(samples), img_size, img_size, 3)``.

    Note:
        The result holds raw, uncompressed pixels at 4 bytes per channel
        value, i.e. ``len(samples) * img_size * img_size * 3 * 4`` bytes.
        That size is independent of how small the image files are on disk,
        so serialising the array can produce output far larger than the
        source folder.
    """
    img_width, img_height, channels = img_size, img_size, 3
    image_list = []
    for filename in samples:
        # The context manager releases the file handle as soon as the pixels
        # have been copied out, instead of leaving one open handle per image
        # until garbage collection.
        with Image.open(os.path.join(DATADIR, filename)) as img:
            # Image.LANCZOS is the same filter formerly exposed as
            # Image.ANTIALIAS, which newer Pillow releases no longer provide.
            resized = img.convert('RGB').resize(
                (img_width, img_height), Image.LANCZOS
            )
            image_list.append(np.array(resized))
    batch = np.asarray(image_list).astype('float32') / 255.0
    # The explicit reshape also gives an empty batch the full 4-D shape.
    return np.reshape(batch, (len(samples), img_width, img_height, channels))


def divide_chunks(samples, n):
    """Lazily yield consecutive slices of ``samples`` holding up to ``n`` items.

    The final slice is shorter when ``len(samples)`` is not a multiple of
    ``n``; an empty ``samples`` yields nothing.
    """
    total = len(samples)
    starts = range(0, total, n)
    yield from (samples[start:start + n] for start in starts)

# Collect the name of every entry in DATADIR (os.listdir returns them in
# arbitrary order and does not filter by file type).
images.extend(os.listdir(DATADIR))

# Split the filenames into consecutive groups of at most n entries.
sub_samples = list(divide_chunks(images, n))

# Write one pickle file per group. The `with` block guarantees the output
# file is closed even if preprocessing or pickling raises part-way through.
for chunk, chunk_files in enumerate(sub_samples):
    with open("sample_{}.pkl".format(chunk), 'wb') as outfile:
        pickle.dump(data_preprocessing(chunk_files, 224), outfile, protocol=4)
4

0 回答 0