unix - 恢复前 361 个字节被截断的 GZIP 文件

Question

我有一个大小为 325 MB 的 gzip 文件。我刚刚发现它的开头被截掉了 361 个字节。

请告知如何从中恢复压缩文件。

score 4 · Accepted Answer

您需要找到下一个 deflate 块边界。这样的边界可以出现在任意比特位置。您需要尝试从每一个比特位置开始解压缩，直到成功解码出至少几个 deflate 块为止。

您可以使用 zlib 的 inflatePrime()，以小于一个字节的粒度向 inflate() 提供比特。您可以使用 inflateSetDictionary() 提供一个伪造的 32K 字典，置于待解压数据之前，以避免出现“距离过远”（distance too far back）错误。

一旦你找到了一个块边界，你就解决了一半的问题。另一半是找出 deflate 流中从哪个位置开始，不再依赖由丢失的 361 字节压缩数据导出的未知未压缩数据。这种依赖可能会持续很长时间。例如，如果单词“ the ”出现在那个缺失的部分，那么之后的数据可以把它作为一个缺失的字符串来引用。然而，你并不知道它是“ the ”。您所知道的只是，存在一个指向缺失数据中某个五字节字符串的引用。然后，该五字节字符串被复制到的位置本身又可以被以后的匹配引用。原则上，这可以传播到整个 325 MB，使整个文件完全无法恢复。

然而，这不太可能。更有可能的是，在某些时候，前 361 个字节的字符串传播会停止。从那里开始，您可以恢复未压缩的数据。

为了判断是否仍然存在这种传播，请进行两次解压缩：一次使用全 0 的初始伪字典，一次使用全 1 的初始伪字典。凡是两次解压缩得到的数据相同的部分，就是您已成功恢复的数据。

然后，您需要上升到该数据的更高一层结构，看看能否以某种方式利用已恢复的内容。

祝你好运。并且下次不要切断前 361 个字节。

下面是执行上述操作的示例代码。

/* salvage -- recover data from a corrupted deflate stream
 * Copyright (C) 2015 Mark Adler
 * Version 1.0  28 June 2015  Mark Adler
 */

/*
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
  madler@alumni.caltech.edu
 */

/* Attempt to recover deflate data from a corrupted stream.  The corrupted data
   is read on stdin, and any reliably decompressed data is written to stdout. A
   deflate stream is deemed to have been found successfully if there are eight
   or fewer bytes of compressed data unused when done.  This can be changed
   with the MAXLEFT macro below, or the conditional that currently uses
   MAXLEFT. */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include "zlib.h"

/* Get the size of an allocated piece of memory (usable size -- not necessarily
   the requested size). */
#if defined(__APPLE__) && defined(__MACH__)
#  include <malloc/malloc.h>
#  define memsize(p) malloc_size(p)
#elif defined (__linux__)
#  include <malloc.h>
#  define memsize(p) malloc_usable_size(p)
#elif defined (_WIN32)
#  include <malloc.h>
#  define memsize(p) _msize(p)
#else
#  error You need to find an allocated memory size function
#endif

#define local static

/* Load an entire file into a memory buffer.  load() returns 0 on success, in
   which case it puts all of the file data in (*data)[0..*len - 1].  *data is
   allocated memory which should be freed when done with it.  A zero-length
   input is also a success, in which case *data == NULL and *len == 0.  On
   failure *data == NULL and *len == 0 as well, and the return value is -1 for
   a read error or 1 for out of memory.  To guard against bogging down the
   system with extremely large allocations, if limit is not zero then load()
   will return an out of memory error if the input is larger than limit bytes.
   An input of exactly limit bytes is accepted. */
local int load(FILE *in, unsigned char **data, size_t *len, size_t limit)
{
    size_t size = 1048576, have = 0;
    unsigned char *buf = NULL, *mem;

    *data = NULL;
    *len = 0;

    /* a limit of zero means no limit other than what size_t can express */
    if (limit == 0)
        limit = (size_t)-1;

    /* never start larger than the limit; limit >= 1 here, so size >= 1 and
       realloc() is never asked for zero bytes */
    if (size > limit)
        size = limit;

    for (;;) {
        /* (re)allocate buf to the current size */
        mem = realloc(buf, size);
        if (mem == NULL) {
            free(buf);
            return 1;
        }
        buf = mem;

        /* read as much as is available into the newly allocated space */
        have += fread(buf + have, 1, size - have, in);

        /* a short read means end of file or a read error -- either way there
           is nothing more to read */
        if (have < size)
            break;

        /* the buffer is full and already at the limit: that is acceptable
           only if the input ends exactly here, so probe for one more byte */
        if (size == limit) {
            if (getc(in) == EOF)
                break;
            free(buf);
            return 1;
        }

        /* double size, saturating at limit (size <= limit always holds, so
           limit - size cannot wrap and size << 1 cannot overflow) */
        size = size > limit - size ? limit : size << 1;
    }

    /* if there was an error reading, discard the data and return an error */
    if (ferror(in)) {
        free(buf);
        return -1;
    }

    /* if a zero-length file is read, return NULL for the data pointer */
    if (have == 0) {
        free(buf);
        return 0;
    }

    /* resize the buffer to be just big enough to hold the data -- if the
       shrink fails, the larger buffer is still valid and is returned as is */
    mem = realloc(buf, have);
    if (mem != NULL)
        buf = mem;

    /* return the data */
    *data = buf;
    *len = have;
    return 0;
}

#define DICTSIZE 32768
#if UINT_MAX <= 0xffff
#  define BUFSIZE 32768
#else
#  define BUFSIZE 1048576
#endif

/* Inflate the provided buffer starting at a specified bit offset.  Use an
   already-initialized inflate stream structure for rapid repeated attempts.
   The structure needs to have been initialized using inflateInit2(strm, -15).
   Inflation begins at data[off], starting at bit bit in that byte, going from
   that bit to the more significant bits in that byte, and then on to the next
   byte.  bit must be in the range 0..7.  bit == 0 uses the entire byte at
   data[off].  bit == 7 uses only the most significant bit of the byte at
   data[off].  Before inflation, the dictionary is initialized to
   dict[0..DICTSIZE-1] so that references before the start of the uncompressed
   data do not stop inflation.  Inflation continues as long as possible, until
   either an error is encountered, the end of the deflate stream is reached, or
   data[len-1] is processed.  On entry *recoup is a pointer to allocated memory
   or NULL, and on return *recoup points to allocated memory with the
   decompressed data.  The buffer at *recoup is grown as needed and is reused
   across calls; the caller frees it.  *got is set to the number of bytes of
   decompressed data returned at *recoup.  If unused is not NULL, *unused is
   set to the number of bytes of data[] that had not yet been handed to and
   consumed by inflate() at the time of return.

   inflate_at() returns Z_DATA_ERROR if an error was detected in the alleged
   deflate data, Z_STREAM_END if the end of a valid deflate stream was reached,
   or Z_OK if the end of the provided compressed data was reached without
   encountering an error or the end of the stream.  Running out of memory is
   treated as fatal and trips an assert(). */
local int inflate_at(z_stream *strm, unsigned char *data, size_t len,
                     size_t off, int bit, size_t *unused, unsigned char *dict,
                     unsigned char **recoup, size_t *got)
{
    int ret;
    size_t left, size;
    unsigned char *mem;

    /* check input */
    assert(data != NULL && off < len && bit >= 0 && bit <= 7);
    assert(dict != NULL && recoup != NULL && got != NULL);

    /* set up inflate engine, feeding first few bits if necessary */
    ret = inflateReset(strm);
    assert(ret == Z_OK);
    ret = inflateSetDictionary(strm, dict, DICTSIZE);
    assert(ret == Z_OK);
    if (bit) {
        /* hand over only the upper 8 - bit bits of the first byte, then
           continue with whole bytes after it */
        ret = inflatePrime(strm, 8 - bit, data[off] >> bit);
        assert(ret == Z_OK);
        off++;
    }

    /* inflate as much as possible */
    strm->next_in = data + off;
    left = len - off;
    *got = 0;
    do {
        /* avail_in is an unsigned int, so feed at most UINT_MAX at a time */
        strm->avail_in = left > UINT_MAX ? UINT_MAX : (unsigned)left;
        left -= strm->avail_in;
        do {
            /* assure at least BUFSIZE available in recoup -- do not query the
               size of a NULL pointer, which not all memsize() providers
               permit */
            size = *recoup == NULL ? 0 : memsize(*recoup);
            if (*got + BUFSIZE > size) {
                size = size ? size << 1 : BUFSIZE;
                assert(size != 0);

                /* portable stand-in for BSD reallocf(): on failure the old
                   buffer is released and *recoup becomes NULL */
                mem = realloc(*recoup, size);
                if (mem == NULL)
                    free(*recoup);
                *recoup = mem;
                assert(*recoup != NULL);
            }

            /* inflate into recoup */
            strm->next_out = *recoup + *got;
            strm->avail_out = BUFSIZE;
            ret = inflate(strm, Z_NO_FLUSH);
            assert(ret != Z_STREAM_ERROR && ret != Z_MEM_ERROR);

            /* set the number of compressed bytes unused so far, in case we
               return */
            if (unused != NULL)
                *unused = left + strm->avail_in;

            /* update the number of uncompressed bytes generated */
            *got += BUFSIZE - strm->avail_out;

            /* if we cannot continue to decompress, then return the reason */
            if (ret == Z_DATA_ERROR || ret == Z_STREAM_END)
                return ret;

            /* continue with provided input data until all output generated */
        } while (strm->avail_out == 0);
        assert(strm->avail_in == 0);

        /* provide more input data, if any */
    } while (left);

    /* ran through all compressed data with no errors or end of stream */
    return Z_OK;
}

/* The criteria for success is the completion of inflate with no more than this
   many bytes unused.  (8 is the length of a gzip trailer.) */
#define MAXLEFT 8

/* Read a corrupted (or not) deflate stream from stdin and write the reliably
   recovered data to stdout.  Progress and results are reported on stderr.
   Every bit position of the input is tried in turn as the start of a deflate
   stream until one inflates with no more than MAXLEFT compressed bytes left
   over.  The exit status is 0 unless the input could not be read or the
   recovered data could not be written, in which case it is 1. */
int main(void)
{
    int ret, bit = 0, found = 0, status = 0;
    unsigned char *data = NULL, *recoup = NULL, *comp = NULL;
    size_t len, off = 0, unused = 0, got = 0;
    z_stream strm;
    unsigned char dict[DICTSIZE] = {0};

    /* read input into memory */
    ret = load(stdin, &data, &len, 0);
    if (ret < 0)
        fprintf(stderr, "file error reading input\n");
    if (ret > 0)
        fprintf(stderr, "ran out of memory reading input\n");
    if (ret != 0)
        return 1;
    fprintf(stderr, "read %zu bytes\n", len);

    /* initialize inflate structure */
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.next_in = Z_NULL;
    strm.avail_in = 0;
    ret = inflateInit2(&strm, -15);
    assert(ret == Z_OK);

    /* scan for an acceptable starting point for inflate -- with empty input
       the loops do not run and found stays 0 */
    for (off = 0; off < len; off++)
        for (bit = 0; bit < 8; bit++) {
            ret = inflate_at(&strm, data, len, off, bit, &unused, dict,
                             &recoup, &got);
            if ((ret == Z_STREAM_END || ret == Z_OK) && unused <= MAXLEFT) {
                found = 1;
                goto done;
            }
        }
  done:

    /* if met the criteria, show result and write out reliable data */
    if (found) {
        fprintf(stderr,
                "decoded %zu bytes (%zu unused) at offset %zu, bit %d\n",
                len - off - unused, unused, off, bit);

        /* decompress again with a different dictionary to detect unreliable
           data */
        memset(dict, 1, DICTSIZE);
        inflate_at(&strm, data, len, off, bit, NULL, dict, &comp, &got);
        {
            unsigned char *p, *q;
            size_t reliable;

            /* search backwards from the end for the first unreliable byte,
               i.e. the last position where the two decompressions differ --
               q is left pointing just after it (or at comp if none differ) */
            p = recoup + got;
            q = comp + got;
            while (q > comp)
                if (*--p != *--q) {
                    p++;
                    q++;
                    break;
                }
            reliable = got - (size_t)(q - comp);

            /* write out the reliable data */
            if (fwrite(q, 1, reliable, stdout) != reliable ||
                fflush(stdout) != 0) {
                fprintf(stderr, "error writing recovered data\n");
                status = 1;
            }
            fprintf(stderr,
                    "%zu bytes of reliable uncompressed data recovered\n",
                    reliable);
            fprintf(stderr,
                    "(out of %zu total uncompressed bytes recovered)\n", got);
        }
    }

    /* otherwise declare failure */
    else
        fprintf(stderr, "no deflate stream found that met criteria\n");

    /* clean up */
    free(comp);
    free(recoup);
    inflateEnd(&strm);
    free(data);
    return status;
}

unix - 恢复前 361 个字节被截断的 GZIP 文件

1 回答 1

Related

Reference