1

Linux。我正在通过 libcurl 下载大量数据。一个数据具有低熵但巨大的大小(数十 Gb)。目前我首先下载文件,然后用 zlib 压缩它。
但这需要更多时间和更多空间。所以我试图在 libcurl 写回调中实现压缩。
主要问题是要下载的数据的确切大小是未知的。代码很脏,但它只是一个测试。看来这不是一条可行的路。也许其他方式更合适?

unsigned char *outZ=malloc(1500);//Maximum write chunk is 1448 — NOTE(review): deflate can EXPAND incompressible input, so 1500 may be too small for a 1448-byte chunk; malloc result is also unchecked
...
/*
 * libcurl write callback: compress each incoming chunk with zip() and
 * write the compressed bytes to the FILE* given via CURLOPT_WRITEDATA.
 *
 * NOTE(review): zip() runs a complete deflate stream per chunk
 * (Z_FINISH), so the output file is a concatenation of independent
 * zlib streams rather than one continuous stream — most decompressors
 * will stop after the first one.  A single persistent z_stream is the
 * correct approach for this use case.
 *
 * Returns size*nmemb on success; any other value makes libcurl abort
 * the transfer (previously compression/write errors were silently
 * ignored and the output could be truncated).
 */
size_t
curl_write_func(void *ptr, size_t size, size_t nmemb, void *stream)
{
    size_t in_len = size * nmemb;
    size_t out_len = zip(ptr, in_len, outZ, 1500, Z_FINISH);

    /* zip() returns 0 only on failure when there was input to compress. */
    if (out_len == 0 && in_len != 0)
        return 0;
    if (fwrite(outZ, 1, out_len, (FILE *) stream) != out_len)
        return 0;
    return in_len;
}

/*
 * Compress src_size bytes from source into target (capacity tgt_size)
 * as one self-contained zlib stream at Z_BEST_COMPRESSION.
 *
 * mode is passed straight to deflate() (e.g. Z_FINISH).
 *
 * Returns the number of compressed bytes produced, or 0 on failure:
 * deflateInit error, or — when mode is Z_FINISH — an output buffer too
 * small to hold the finished stream.  The latter check matters because
 * deflate can expand incompressible input, and the original code
 * reported a truncated stream as success.
 */
size_t
zip(void *source, size_t src_size, void *target, size_t tgt_size, int mode)
{
    z_stream strm;
    int rc;

    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    if (deflateInit(&strm, Z_BEST_COMPRESSION) != Z_OK)
        return 0;

    strm.next_in = source;
    strm.avail_in = src_size;
    strm.next_out = target;
    strm.avail_out = tgt_size;
    rc = deflate(&strm, mode);
    deflateEnd(&strm);

    /* With Z_FINISH, anything other than Z_STREAM_END means the output
     * buffer filled before the stream could be terminated. */
    if (mode == Z_FINISH && rc != Z_STREAM_END)
        return 0;
    return (tgt_size - strm.avail_out);
}
4

2 回答 2

1

At last I made it work with native zlib library
It may be not elegant enough, but my C skills are poor at the moment
Some variables have to be global, as the zlib stream must be uninterrupted.
I compress the previous block, because the last one must be flushed with Z_FINISH and I didn't find a way to determine via libcurl calls which block is the last.
So once we are out of easy_perform, the previously buffered block was the last one :)
All error checking are omitted for clarity
SHA1 checksum of original stream is calculated also

#define CHUNK 16384
// Shared state: the deflate stream must persist across write callbacks,
// so these live at file scope (see the note above the callback).
SHA_CTX ctx;                     // running SHA1 of the *uncompressed* data
z_stream strm;                   // single deflate stream spanning the whole download
unsigned char old_block[CHUNK];  // previous chunk, held back until the next callback
unsigned char out[CHUNK];        // deflate output staging buffer
unsigned have;                   // bytes produced by the last deflate() call
size_t prevBlkSize;              // number of valid bytes staged in old_block
char firstIter;                  // nonzero until the first chunk has been staged

/*
 * libcurl write callback with a one-chunk delay: each call deflates the
 * PREVIOUS chunk with Z_NO_FLUSH and stages the current one, so that
 * after curl_easy_perform() returns, the staged chunk is known to be
 * the last and can be flushed with Z_FINISH.  Also feeds the raw
 * (uncompressed) bytes into the running SHA1.
 *
 * Returns size*nmemb on success; any other value makes libcurl abort
 * the transfer.
 */
size_t
curl_write_func(void *ptr, size_t size, size_t nmemb, void *stream)
{
    //Stores the size of original data to write
    size_t orig_size=size*nmemb;

    /* Guard the fixed-size staging buffer: libcurl normally delivers at
     * most CURL_MAX_WRITE_SIZE (16 KB by default) per call, but that is
     * a build-time default, not a guarantee — an oversized chunk would
     * overflow old_block and corrupt memory. */
    if(orig_size>CHUNK)
        return 0;

    if(firstIter)
    {
        /* Nothing to compress yet — just stage the first chunk. */
        memcpy(old_block,ptr,orig_size);
        prevBlkSize=orig_size;
        firstIter=0;
        SHA1_Update(&ctx, ptr, orig_size);
        return(orig_size);
    }
    //Compress the previously staged block with Z_NO_FLUSH

    strm.avail_in=prevBlkSize;
    strm.next_in = old_block;
    do
    {
        strm.avail_out = CHUNK;
        strm.next_out = out;
        deflate(&strm, Z_NO_FLUSH);
        have = CHUNK - strm.avail_out;
        /* Abort the transfer on a short write instead of silently
         * producing a corrupt archive. */
        if(fwrite(out, 1, have, (FILE *) stream)!=have)
            return 0;
    }
    while (strm.avail_out == 0);

    //Stage the current chunk for the next call (or the final flush)
    memcpy(old_block,ptr,orig_size);
    prevBlkSize=orig_size;
    SHA1_Update(&ctx, ptr, orig_size);
    return (orig_size);
}
...
FILE *xva_export = fopen(xva_name, "wb");
//Prepare SHA1 and zlib
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
deflateInit(&strm, 9);
SHA1_Init(&ctx);
...
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write_func);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, xva_export);
curl_easy_perform(curl);
curl_easy_cleanup(curl);
//Finish zlib: the write callback always holds back the most recent
//chunk, so deflate it now with Z_FINISH to terminate the stream.
strm.avail_in=prevBlkSize;
strm.next_in = old_block;
do
{
    strm.avail_out = CHUNK;
    strm.next_out = out;
    deflate(&strm, Z_FINISH);
    have = CHUNK - strm.avail_out;
    fwrite(out, 1, have, xva_export);
}
while (strm.avail_out == 0);
deflateEnd(&strm);
//SHA1 finish
SHA1_Final(hash, &ctx);
snprintf(sha1_name,sizeof(sha1_name),"%s.Z.sha1",xva_name);
FILE *sha1sum=fopen(sha1_name,"w");
for(int i=0; i<SHA_DIGEST_LENGTH; i++)
{
    //%02x, not %x: without zero-padding, any digest byte < 0x10 prints
    //as a single hex digit and the written checksum string is wrong.
    fprintf(sha1sum,"%02x",hash[i]);
}
fclose(sha1sum);
fclose(xva_export);
于 2013-03-28T15:38:58.360 回答
1

你控制服务器端发生的事情吗?

如果是,那么可以在关闭 libcurl 端的内容解码的同时,直接向服务器请求已经 GZIP 压缩过的内容。以下是具体做法(来自邮件列表上最近的一次讨论):请求压缩编码但不解码

否则,像 libarchive 这样提供流式接口的库——可以以流式方式馈送数据(就像配合 libcurl 的流式 HTML 解析器那样)——应该可以解决问题。

--

更新:这是一个使用 zlib 动态压缩的示例代码:https://github.com/deltheil/zurl 。

于 2013-03-26T10:55:14.397 回答