python - 创建和流式传输大型存档，而不将其存储在内存或磁盘中

Question

我想允许用户一次下载多个大文件的存档。但是，文件和存档可能太大而无法存储在我的服务器上的内存或磁盘中（它们是从其他服务器即时流式传输的）。我想在将存档流式传输给用户时生成存档。

我可以使用 Tar 或 Zip 或任何最简单的方法。我正在使用 django，它允许我在响应中返回生成器或类似文件的对象。该对象可用于推动该过程。但是，我很难弄清楚如何围绕 zipfile 或 tarfile 库构建这种东西，而且我担心它们可能不支持在运行时读取文件，或者在构建存档时读取它。

这个关于将迭代器转换为类文件对象的答案可能会有所帮助。 tarfile#addfile需要一个可迭代的，但它似乎立即将它传递给shutil.copyfileobj，所以这可能不像我希望的那样对生成器友好。

score 9 · Accepted Answer

9

我最终使用了SpiderOak ZipStream。

于 2012-10-10T01:42:52.173 回答

score 7 · Accepted Answer

您可以将 ZipFile 流式传输到 Pylons 或 Django 的响应 fileobj，方法是将该 fileobj 包装在一个实现了 tell() 的对象中。这样会在内存中缓冲 zip 中的每个单独文件，但 zip 本身是流式传输的。我们用它来流式下载一个装满图像的 zip 文件，因此内存中缓冲的内容永远不会超过一张图像。

此示例将数据流式写入 sys.stdout。对于 Pylons，请使用 response.body_file；对于 Django，您可以直接将 HttpResponse 本身用作文件对象。

import zipfile
import sys


class StreamFile(object):
    """Write-only wrapper that adds ``tell()`` to a stream lacking it.

    Every write is forwarded to the wrapped object while a running byte
    count is kept in ``pos``; ``tell()`` reports that count.
    """

    def __init__(self, fileobj):
        """Wrap *fileobj*; the position counter starts at zero."""
        self.pos = 0
        self.fileobj = fileobj

    def tell(self):
        """Return the number of items written through this wrapper so far."""
        return self.pos

    def flush(self):
        """Flush the wrapped stream."""
        self.fileobj.flush()

    def write(self, str):
        """Forward *str* to the wrapped stream, then advance the position."""
        target = self.fileobj
        target.write(str)
        # Only count the data once the underlying write has succeeded.
        self.pos = self.pos + len(str)


# Wrap a stream so ZipFile can use it. A zip archive is binary data: on
# Python 3 sys.stdout is a text stream, so write to its underlying binary
# buffer when there is one (Python 2's sys.stdout has no ``buffer`` attribute
# and already accepts byte strings).
out = StreamFile(getattr(sys.stdout, "buffer", sys.stdout))

# The context manager closes the archive (writing the central directory)
# even if adding one of the members raises.
with zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED) as z:
    for i in range(5):
        z.writestr("hello{0}.txt".format(i), "this is hello{0} contents\n".format(i) * 3)

score 7 · Accepted Answer

您可以通过生成并流式传输不压缩的 zip 文件来做到这一点，这基本上只是在每个文件的内容之前添加文件头。您说得对，这些库并不支持这种用法，但您可以绕过它们来实现。

此代码使用一个类包装 zipfile.ZipFile，该类管理流并在文件到来时为文件创建 zipfile.ZipInfo 的实例。最后可以设置CRC和大小。您可以使用 put_file()、write() 和 flush() 将输入流中的数据推送到其中，并使用 read() 将数据从中读取到输出流中。

import struct      
import zipfile
import time

from StringIO import StringIO

class ZipStreamer(object):
    """Build an uncompressed (ZIP_STORED) zip archive incrementally.

    Protocol: ``put_file()`` starts an entry, ``write()`` appends data to it
    (any number of times), ``flush()`` finishes the entry, and ``close()``
    writes the central directory once all entries are done. ``read()`` may be
    called at any time to fetch the archive bytes produced since the previous
    ``read()``.

    The size and CRC of an entry are unknown when it starts, so its local
    header sets general-purpose flag bit 3 (0x08) and the real values are
    written after the data by ``flush()``.

    Limitations visible in this implementation:
      * the internal buffer is never truncated, so everything written stays
        in memory for the lifetime of the object;
      * sizes are packed as 32-bit fields, so a single entry must be smaller
        than 4 GiB.
    """

    def __init__(self):
        # Local import keeps the class self-contained. io.BytesIO stores
        # binary data on both Python 2.6+ and Python 3.
        from io import BytesIO

        self.out_stream = BytesIO()

        # write to the in-memory stream with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)

        # ZipInfo of the entry currently being written, or None.
        self.current_file = None

        # Buffer offset up to which read() has already returned data.
        self._last_streamed = 0

    def put_file(self, name, date_time=None):
        """Start a new archive entry called *name*.

        *date_time* is a ``(year, month, day, hour, minute, second)`` tuple;
        it defaults to the current local time.
        """
        if date_time is None:
            date_time = time.localtime(time.time())[:6]

        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        # Bit 3: CRC and sizes follow the file data instead of preceding it.
        zinfo.flag_bits = 0x08
        # Permission bits live in the high 16 bits of external_attr.
        # ``0o600`` is valid on Python 2.6+ and Python 3 (``0600`` is not).
        zinfo.external_attr = 0o600 << 16
        # tell() is the public way to get the position (``.pos`` only exists
        # on Python 2's pure-Python StringIO).
        zinfo.header_offset = self.out_stream.tell()

        # write right values later
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0

        self.zipfile._writecheck(zinfo)

        # write header to stream
        self.out_stream.write(zinfo.FileHeader())

        self.current_file = zinfo

    def flush(self):
        """Finish the current entry and register it with the archive."""
        zinfo = self.current_file
        # Trailing record holding the values that were zero in the header.
        self.out_stream.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        # Register the entry so close() lists it in the central directory.
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        """Append *bytes* to the current entry and update its CRC and sizes."""
        self.out_stream.write(bytes)
        self.out_stream.flush()
        zinfo = self.current_file
        # update these...
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        zinfo.file_size += len(bytes)
        # Stored (uncompressed) entries have identical sizes.
        zinfo.compress_size += len(bytes)

    def read(self):
        """Return the archive bytes produced since the previous ``read()``."""
        end = self.out_stream.tell()

        self.out_stream.seek(self._last_streamed)
        data = self.out_stream.read()

        # Restore the write position so later writes append at the end.
        self.out_stream.seek(end)
        self._last_streamed = end

        return data

    def close(self):
        """Write the central directory; call ``read()`` afterwards to get it."""
        # The entries were written behind ZipFile's back. Python 3's
        # ZipFile.close() seeks to ``start_dir`` and records it as the central
        # directory offset, so point it at the real end of the entry data.
        # (Python 2 ignores this attribute and uses fp.tell() instead.)
        self.zipfile.start_dir = self.out_stream.tell()
        self.zipfile.close()

请记住，这段代码只是一个快速的概念验证；在我决定让 HTTP 服务器自己处理这个问题之后，就没有再做进一步的开发或测试。如果您决定使用它，应该检查几件事：嵌套文件夹是否被正确归档，以及文件名编码（无论如何，这对 zip 文件来说总是很麻烦）。

score 3 · Accepted Answer

这是上面 Pedro Werneck 的解决方案，但做了一处修复，以避免把所有数据都积累在内存中（对 read 方法做了少量修正）：

class _ZipSink(object):
    """Append-only byte sink that can be drained without losing its offset.

    ``tell()`` reports the total number of bytes ever written, even after
    ``drain()`` has handed the buffered data out, so offsets recorded in the
    archive stay correct. The sink deliberately has no ``seek()``.
    """

    def __init__(self):
        self._chunks = []
        self._offset = 0

    def write(self, data):
        self._chunks.append(data)
        self._offset += len(data)
        return len(data)

    def tell(self):
        return self._offset

    def flush(self):
        # Nothing to do: data is handed out explicitly through drain().
        pass

    def drain(self):
        """Return everything buffered so far and release it from memory."""
        data = b"".join(self._chunks)
        del self._chunks[:]
        return data


class ZipStreamer(object):
    """Build an uncompressed (ZIP_STORED) zip archive as a stream of chunks.

    Protocol: ``put_file()`` starts an entry, ``write()`` appends data to it
    (any number of times), ``flush()`` finishes the entry, and ``close()``
    writes the central directory. ``read()`` returns the bytes produced since
    the previous ``read()`` and frees them, so memory use is bounded by what
    is written between two reads. Call ``read()`` once more after ``close()``
    to obtain the central directory.

    Sizes are packed as 32-bit fields, so a single entry must be smaller
    than 4 GiB.
    """

    def __init__(self):
        self.out_stream = _ZipSink()

        # write to the sink with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)

        # ZipInfo of the entry currently being written, or None.
        self.current_file = None

    def put_file(self, name, date_time=None):
        """Start a new archive entry called *name*.

        *date_time* is a ``(year, month, day, hour, minute, second)`` tuple;
        it defaults to the current local time.
        """
        if date_time is None:
            date_time = time.localtime(time.time())[:6]

        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        # Bit 3: CRC and sizes follow the file data instead of preceding it.
        zinfo.flag_bits = 0x08
        # ``0o600`` is valid on Python 2.6+ and Python 3 (``0600`` is not).
        zinfo.external_attr = 0o600 << 16
        # Absolute offset within the whole archive, not within the part of
        # the buffer that has not been drained yet.
        zinfo.header_offset = self.out_stream.tell()

        # write right values later
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0

        self.zipfile._writecheck(zinfo)

        # write the local file header to the output sink
        self.out_stream.write(zinfo.FileHeader())

        self.current_file = zinfo

    def flush(self):
        """Finish the current entry and register it with the archive."""
        zinfo = self.current_file
        # Trailing record holding the values that were zero in the header.
        self.out_stream.write(
            struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
                        zinfo.file_size))
        # Register the entry so close() lists it in the central directory.
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        """Append *bytes* to the current entry and update its CRC and sizes."""
        zinfo = self.current_file
        # Compute the CRC first so invalid input fails before it is buffered.
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        self.out_stream.write(bytes)
        zinfo.file_size += len(bytes)
        # Stored (uncompressed) entries have identical sizes.
        zinfo.compress_size += len(bytes)

    def read(self):
        """Return and discard the bytes produced since the previous call."""
        return self.out_stream.drain()

    def close(self):
        """Write the central directory; call ``read()`` afterwards to get it."""
        # The entries were written behind ZipFile's back. Python 3 records
        # ``start_dir`` as the central directory offset, so set it to the
        # real end of the entry data (Python 2 uses fp.tell() instead).
        self.zipfile.start_dir = self.out_stream.tell()
        self.zipfile.close()

然后您可以将stream_generator函数用作 zip 文件的流

def stream_generator(files_paths, chunk_size=64 * 1024):
    """Yield the bytes of a zip archive containing *files_paths*.

    Each file is read in binary mode, *chunk_size* bytes at a time, and the
    archive bytes produced so far are yielded after every chunk, so a whole
    file is never held in memory. Each path is also used as the entry name
    inside the archive.

    The final yield carries the central directory written by ``close()``;
    without it the receiver gets a truncated archive.
    """
    s = ZipStreamer()
    for f in files_paths:
        s.put_file(f)
        # Binary mode: text mode would decode or translate the file's bytes.
        with open(f, 'rb') as _f:
            while True:
                block = _f.read(chunk_size)
                if not block:
                    break
                s.write(block)
                yield s.read()
        s.flush()
        # Emits the entry trailer (and the header, for an empty file).
        yield s.read()
    s.close()
    yield s.read()

Falcon 示例：

class StreamZipEndpoint(object):
    """Example resource that answers GET with a streamed zip download.

    The surrounding text presents this as a Falcon example.
    """

    def on_get(self, req, resp):
        """Set the download headers and attach the zip stream to *resp*."""
        zip_filename = 'output_filename.zip'
        source_paths = ['/path/to/file/1', '/path/to/file/2']

        # Header value that names the file for the client to save.
        disposition = 'attachment; filename="%s"' % (zip_filename,)

        resp.content_type = 'application/zip'
        resp.set_headers([('Content-Disposition', disposition)])

        # The body is produced by iterating the generator.
        resp.stream = stream_generator(source_paths)

score 0 · Accepted Answer

一个选项是使用stream-zip（完全披露：由我编写）

稍微修改其示例：

from datetime import datetime
from stream_zip import stream_zip, ZIP_64

def non_zipped_files():
    """Yield one tuple per member file for ``stream_zip`` to archive.

    Each tuple is ``(name, modified_at, perms, method, chunks)``, where
    *chunks* is an iterable of byte strings holding the file's contents.
    """
    modified_at = datetime.now()
    perms = 0o600

    # Hard coded in this example, but in real cases could
    # for example yield data from a remote source
    def file_1_data():
        for _ in range(1000):
            yield b'Some bytes'

    def file_2_data():
        for _ in range(1000):
            yield b'Some bytes'

    # The imported constant is ZIP_64; the name ``ZIP64`` is not defined.
    yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_1_data()
    yield 'my-file-2.txt', modified_at, perms, ZIP_64, file_2_data()

# Build the iterable of archive chunks from the member-file generator.
zipped_chunks = stream_zip(non_zipped_files())

# Can print each chunk, or return them to a client,
# say using Django's StreamingHttpResponse
for zipped_chunk in zipped_chunks:
    print(zipped_chunk)

python - 创建和流式传输大型存档，而不将其存储在内存或磁盘中

5 回答 5

Related

Reference