c++ - 计算文件的熵

Question

我已经尝试从谷歌、论坛、维基百科和很多很多论坛搜索这个功能超过两个小时，但我找不到它。我怎么能做到这一点？我尝试了以下但没有奏效。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdint.h>

static unsigned int mylog2 (unsigned int val) {
 unsigned int ret = -1;
 while (val != 0) {
    val >>= 1;
    ret++;
 }
 return ret;
}

int main(int argc, char **argv)
{
FILE            *pFile;
int             i;              // various loop index
int             j;              // filename loop index
int             n;              // Bytes read by fread;
int             size;           // Filesize
float           entropy;
float           temp;           // temp value used in entropy calculation
long            alphabet[256];
unsigned char   buffer[1024];


/* do this for all files */
for(j = 1; j < argc; j++)
{
    /* initialize all values */
    size = 0;
    entropy = 0.0;
    memset(alphabet, 0, sizeof(long) * 256);

    pFile = fopen(argv[j], "rb");
    if(pFile == NULL)
    {
        printf("Failed to open `%s`\n", argv[j]);
        continue;
    }

    /* Read the whole file in parts of 1024 */
    while((n = fread(buffer, 1, 1024, pFile)) != 0)
    {
        /* Add the buffer to the alphabet */
        for (i = 0; i < n; i++)
        {
            alphabet[(int) buffer[i]]++;
            size++;
        }
    }
    fclose(pFile);

    /* entropy calculation */
    for (i = 0; i < 256; i++)
    {
        if (alphabet[i] != 0)
        {
            temp = (float) alphabet[i] / (float) size;
            entropy += -temp * mylog2(temp);
        }
    }
    printf("%02.5f [ %02.5f ]\t%s\n", entropy, entropy / 8, argv[j]);
 } // outer for 
 return 0;
}

我知道我做错了。在python中它似乎要容易得多，在python中它是：

import sys
import math

if len(sys.argv) != 2:
    print "Usage: file_entropy.py [path]filename"
    sys.exit()

# read the whole file into a byte array
f = open(sys.argv[1], "rb")
byteArr = map(ord, f.read())
f.close()
fileSize = len(byteArr)
print 'File size in bytes:'
print fileSize
print

# calculate the frequency of each byte value in the file
freqList = []
for b in range(256):
    ctr = 0
    for byte in byteArr:
        if byte == b:
            ctr += 1
    freqList.append(float(ctr) / fileSize)
# print 'Frequencies of each byte-character:'
# print freqList
# print

# Shannon entropy
ent = 0.0
for freq in freqList:
    if freq > 0:
        ent = ent + freq * math.log(freq, 2)
ent = -ent
print 'Shannon entropy (min bits per byte-character):'
print ent
print
print 'Min possible file size assuming max theoretical compression efficiency:'
print (ent * fileSize), 'in bits'
print (ent * fileSize) / 8, 'in bytes'

###  Modifications to file_entropy.py to create the Histogram start here ###
### by Ken Hartman  www.KennethGHartman.com

import numpy as np
import matplotlib.pyplot as plt

N = len(freqList)

ind = np.arange(N)  # the x locations for the groups
width = 1.00        # the width of the bars

#fig = plt.figure()
fig = plt.figure(figsize=(11,5),dpi=100)
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, freqList, width)
ax.set_autoscalex_on(False)
ax.set_xlim([0,255])

ax.set_ylabel('Frequency')
ax.set_xlabel('Byte')
ax.set_title('Frequency of Bytes 0 to 255\nFILENAME: ' + sys.argv[1])

plt.show()

如何在 C++ 中实现相同的功能？希望有人如实回答。

score 0 · Accepted Answer

您不能以 2 为底计算对数的整数部分。要在 C 中以 2 为底计算对数，您可以使用log2from math.h。

score 0 · Accepted Answer

香农熵是H= -1*sum(p_i*log(p_i))其中p_i是每个符号 i 的频率（总和），如果对数基数为 2 ，则结果以每个符号的位数为单位，如果对数基数为 n ，则为“nats” 。但是如果你改变你表达数据的方式，它会改变，即如果相同的数据表示为位、字节等。所以你可以除以 log(n)，其中n 是可用符号的数量（2 表示二进制，256 表示字节）和 H 的范围从 0 到 1（这是归一化的密集香农熵）。

上述熵是一种“密集”形式，即类似于物理中的特定熵的每符号，每公斤或每摩尔。像物理熵这样的常规“广泛”熵是S=N*H其中 N 是文件中符号的数量。对上面的 H 进行一些数学运算，给出了文件的归一化扩展熵，其中“n”是不同的“i”符号的数量（二进制为 2，字节为 256）：

S=N * H / log(n) = sum(count_i*log(N/count_i))/log(n)

对于每个符号频率相同的文件，这给出S=N. 熵不对数据进行任何压缩，因此完全不知道任何模式，因此 000000111111 具有与 010111101000 相同的 H 和 S（两种情况下都有 6 个 1 和 6 个 0）。

c++ - 计算文件的熵

2 回答 2

Related

Reference