您可能知道,对文件调用 stat() 并将结果填入 struct stat info 之后,info.st_size 是文件的总大小(以字节为单位),而 info.st_blocks*512 是实际存储在磁盘上的字节数。
在 Linux 中,文件系统以对齐的、大小为 info.st_blksize 字节的块来存储数据。(这也意味着 info.st_blocks*512 可以大于 info.st_size,最多大 info.st_blksize-1 字节;如果文件是稀疏的,它也可以更小。)
未存储的数据(稀疏文件中的空洞)和显式归零的存储数据都读为零。
如果您想知道文件中有多少个零填充块,则需要读取整个文件。使用大小为info.st_blksize
字节整数倍的缓冲区。对于每个对齐的st_blksize
字节块,检查它们是否全为零。令总块数(包括最后可能的部分块)为total_blocks
,且所有内容为零的块数为zero_blocks
。
/* Sketch: derive block counts from the result of stat()/fstat().
 * 'info' is assumed to have been filled in already, and all counters
 * are assumed to be declared by the surrounding code. */
struct stat info;
/* Number of filesystem blocks for the file, rounded up.
 * The conditional must be parenthesized: '?:' binds more loosely than '+',
 * so without the parentheses the whole sum becomes the condition and the
 * result is only ever 0 or 1. */
total_blocks = info.st_size / info.st_blksize
             + ((info.st_size % info.st_blksize) ? 1 : 0);
/* Number of bytes stored for the file (st_blocks is in 512-byte units) */
stored_bytes = 512 * info.st_blocks;
/* Number of filesystem blocks used for file data, rounded up */
stored_blocks = stored_bytes / info.st_blksize
              + ((stored_bytes % info.st_blksize) ? 1 : 0);
/* Number of sparse blocks */
sparse_blocks = total_blocks - stored_blocks;
/* TODO: count zero_blocks,
 *       by reading file in info.st_blksize chunks,
 *       and saving the number of all-zero chunks
 *       in zero_blocks. */
/* Number of stored zero blocks: all-zero blocks that are not holes */
zeroed_blocks = zero_blocks - sparse_blocks;
转换成字节,你有
info.st_size
是以字节为单位的文件大小
stored_blocks*info.st_blksize
是磁盘上使用的字节数
sparse_blocks*info.st_blksize
是磁盘上稀疏孔中的字节数
zeroed_blocks*info.st_blksize
是磁盘上不必要存储的零字节数;本来可以存储为稀疏孔
请注意,您可以使用 cp --sparse=always --preserve=all SOURCEFILE TARGETFILE 创建文件的相同副本,同时“优化”其稀疏性,使足够长的连续零字节序列被存储为空洞;这有助于测试您的程序。详情请参阅 man 1 cp。您还可以使用 dd if=/dev/zero of=TARGETFILE bs=BLOCKSIZE count=BLOCKS 创建长的零字节序列;详情请参阅 man 1 dd 和 man 4 null。
编辑添加:
这是一个示例函数,examine()
它打开指定的文件,获取统计信息,并在必要时(即请求存储不必要的零的数量)读取整个文件。
我只是对它进行了简单的测试,但它应该实现上面的逻辑。
它非常粗糙;我最关注的是错误检查以及动态内存分配/释放的正确性。(它应该检查并返回所有错误情况,甚至包括一些本不应发生的错误,并且永远不会泄漏内存。当然,前提是我的代码或思路没有出错——欢迎指正。)
最好将其拆分为更小、更易于管理的函数。
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
/* Report whether a buffer consists solely of zero bytes.
 * Returns 1 when all 'len' bytes at 'ptr' are zero (including len == 0),
 * and 0 as soon as a nonzero byte is found.
 */
static inline int is_zero(const void *const ptr, const size_t len)
{
    const char *const bytes = ptr;
    size_t idx;

    for (idx = 0; idx < len; idx++) {
        if (bytes[idx] != 0)
            return 0;
    }

    return 1;
}
/* Examine a file: report its size and how much of it is stored on disk.
 *
 * Return 0 if success, errno error code otherwise.
 *   (*sizeptr):      File size in bytes
 *   (*blocksizeptr): File block size in bytes
 *   (*storedptr):    Bytes stored on disk (512 * st_blocks)
 *   (*sparseptr):    Bytes in sparse holes
 *   (*zeroedptr):    Unnecessarily stored zero bytes
 * Each output pointer may be NULL, in which case that value is not reported.
 * The sparse and zeroed figures are based on the file size rounded up to
 * the next multiple of the block size.
 * If zeroedptr is NULL, the file is only opened and
 * statistics obtained via fstat(). Otherwise, the entire
 * file will be read.
 * Special errors:
 *   EINVAL:  NULL or empty file name
 *   EISDIR:  Name refers to a directory
 *   EISNAM:  Name refers to a pipe or device
 *   ENOTSUP: fstat() reported a block size smaller than one byte
 *   ENOMEM:  Could not allocate the read buffer
 *   EIO:     read() returned an impossible (negative, not -1) value
 *   EBUSY:   File was modified during read
 */
int examine(const char *const filename,
            uint64_t *const sizeptr,
            uint64_t *const blocksizeptr,
            uint64_t *const storedptr,
            uint64_t *const sparseptr,
            uint64_t *const zeroedptr)
{
    struct stat info;
    int fd, result;
    size_t size, have;
    uint64_t total, nonzero, stored;
    int cause = 0;
    char *data = NULL;

    /* Check for NULL or empty filename. */
    if (!filename || !*filename)
        return errno = EINVAL;

    /* Open the specified file, retrying if interrupted by a signal. */
    do {
        fd = open(filename, O_RDONLY | O_NOCTTY);
    } while (fd == -1 && errno == EINTR);
    if (fd == -1)
        return errno;

    /* Single-pass block: 'break' jumps to the common cleanup at the end. */
    do {
        size_t blksize;

        /* Obtain file statistics. */
        if (fstat(fd, &info) == -1) {
            cause = errno;
            break;
        }

        /* Verify we have a valid block size. This must happen before the
         * block size is used as a divisor below; a zero value would divide
         * by zero, a negative one would wrap when converted to unsigned. */
        if (info.st_blksize < 1) {
            cause = ENOTSUP;
            break;
        }
        /* The validated block size is used for all later arithmetic, even
         * if the statistics are refreshed after reading. */
        blksize = (size_t)info.st_blksize;

        /* Count total, rounding up to next multiple of block size. */
        total = (uint64_t)info.st_size;
        if (total % (uint64_t)blksize)
            total += (uint64_t)blksize - (total % (uint64_t)blksize);

        /* Count total stored bytes. */
        stored = (uint64_t)512 * (uint64_t)info.st_blocks;

        /* Fill in immediately known fields. */
        if (sizeptr)
            *sizeptr = (uint64_t)info.st_size;
        if (blocksizeptr)
            *blocksizeptr = (uint64_t)blksize;
        if (storedptr)
            *storedptr = stored;
        if (sparseptr) {
            if (total > stored)
                *sparseptr = total - stored;
            else
                *sparseptr = 0;
        }
        if (zeroedptr)
            *zeroedptr = 0;

        /* Verify we have a regular file. */
        if (S_ISDIR(info.st_mode)) {
            cause = EISDIR;
            break;
        } else if (!S_ISREG(info.st_mode)) {
            cause = EISNAM;
            break;
        }

        /* If zeroedptr is NULL, we do not need to read the file. */
        if (!zeroedptr) {
            /* Close descriptor and return success.
             * NOTE(review): Linux close(2) advises against retrying close()
             * after EINTR; confirm whether these retry loops are wanted. */
            do {
                result = close(fd);
            } while (result == -1 && errno == EINTR);
            if (result == -1)
                return errno;
            return 0;
        }

        /* Use large enough chunks for I/O: a whole number of blocks,
         * about 128 KiB unless a single block is already larger. */
        if (blksize < (size_t)131072)
            size = ((size_t)131072 / blksize) * blksize;
        else
            size = blksize;

        /* Allocate buffer. */
        data = malloc(size);
        if (!data) {
            cause = ENOMEM;
            break;
        }

        /* Clear counters. From here on, total counts bytes actually read. */
        total = 0;
        nonzero = 0;
        have = 0;

        /* Read loop. */
        while (1) {
            size_t i, overlap;
            ssize_t bytes;
            int ended = 0;

            /* Fill the buffer until it holds at least one full block. */
            while (have < blksize) {
                bytes = read(fd, data + have, size - have);
                if (bytes > (ssize_t)0) {
                    have += (size_t)bytes;
                    total += (uint64_t)bytes;
                } else if (bytes == (ssize_t)0) {
                    /* Clear the end of the buffer; just to be sure */
                    memset(data + have, 0, size - have);
                    ended = 1;
                    break;
                } else if (bytes != (ssize_t)-1) {
                    cause = EIO;
                    break;
                } else if (errno != EINTR) {
                    cause = errno;
                    break;
                }
                /* EINTR: simply retry the read. */
            }
            if (cause)
                break;

            /* Count nonzero full blocks in the buffer, but add up as bytes. */
            i = have / blksize;
            while (i-- > 0)
                if (!is_zero(data + i * blksize, blksize))
                    nonzero += (uint64_t)blksize;

            /* Followed by a partial block? Move it to the buffer start.
             * The source lies past at least one full block, so the two
             * regions cannot overlap. */
            overlap = have % blksize;
            if (overlap) {
                if (have > overlap)
                    memcpy(data, data + have - overlap, overlap);
                have = overlap;
            } else
                have = 0;

            /* Next round of the loop, unless end of input. */
            if (!ended)
                continue;

            /* Entire file has been processed. */

            /* Partial block in buffer? It occupies a whole block on disk. */
            if (have) {
                if (!is_zero(data, have))
                    nonzero += (uint64_t)blksize;
            }

            /* If file size changed, update statistics. */
            if (total != (uint64_t)info.st_size) {
                if (fstat(fd, &info) == -1) {
                    cause = errno;
                    break;
                }
                /* File changed from under us? */
                if (total != (uint64_t)info.st_size) {
                    cause = EBUSY;
                    break;
                }
            }

            /* Align total size to (next) multiple of block size. */
            if (total % (uint64_t)blksize)
                total += (uint64_t)blksize - (total % (uint64_t)blksize);

            /* Bytes stored on disk. */
            stored = (uint64_t)512 * (uint64_t)info.st_blocks;

            /* Sanity check. (File changed while we read it?)
             * NOTE(review): a filesystem that preallocates blocks or stores
             * data inline/compressed could presumably trip this too;
             * confirm before relying on EBUSY meaning "modified". */
            if (stored > total || nonzero > stored) {
                cause = EBUSY;
                break;
            }

            /* Update fields. */
            if (sizeptr)
                *sizeptr = (uint64_t)info.st_size;
            if (storedptr)
                *storedptr = (uint64_t)512 * (uint64_t)info.st_blocks;
            if (sparseptr)
                *sparseptr = total - stored;
            if (zeroedptr)
                *zeroedptr = (total - nonzero) - (total - stored);

            /* Discard buffer. */
            free(data);

            /* Close file and return. */
            do {
                result = close(fd);
            } while (result == -1 && errno == EINTR);
            if (result == -1)
                return errno;
            return 0;
        }
    } while (0);

    /* Free buffer, if allocated. free(NULL) is safe. */
    free(data);

    /* Close file, and return with cause (close errors are ignored here). */
    do {
        result = close(fd);
    } while (result == -1 && errno == EINTR);

    return errno = cause;
}
为了可移植性,所有返回参数都是 64 位无符号整数,并以字节为单位给出相应的大小。请注意,(*storedptr)+(*sparseptr) 等于向上舍入到 (*blocksizeptr) 的下一个倍数的文件总字节数。(*zeroedptr) 仅包括显式存储的零,不包括稀疏空洞。同样,可将 (*zeroedptr) 视为不必要地存储的零字节数。
我使用 rm -f test ; dd if=/dev/zero of=test bs=10000 seek=3 count=1 生成了一个名为 test 的文件,它由 30,000 字节的空洞后跟 10,000 个零字节组成。examine() 返回 size=40000、blocksize=4096、stored=12288、sparse=28672、zeroed=12288,这在我看来是正确的。
问题?