0

我想读取一个包含一些字符的文件并检查可打印字符的百分比以及空格的百分比。这是我生成输入文件的 Python 代码:

import string
import random
array = list()
array = list(string.printable)
print(array)
external = ['\0','\a','\b','\v','\f','\e']
array = array + external
file = open("in.txt" , 'w')
for i in range (1000):
        outputline = array[random.randrange(0,len(array)-1)]
        file.write(outputline)
file.close()

我希望我的文件同时具有可打印字符和空格以及其他字符(不在这两组中)。我通过两种方式做到这一点:

  1. read使用C 中的系统调用逐块读取文件:

    #include <stdio.h>
    #include <stdio.h>
    #include <ctype.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <errno.h>
    int main(int arg, char *argv[])
    {
        char c ;
        char *data;
        int numOfWs = 0 ;
        int numOfPr = 0 ;
        int numberOfCharacters;
        int sizeOfBlock ;
        int nread ;
        int i=0;
        int k;
        float wsP = 0;
        float prP = 0;
        sizeOfBlock = atoi(argv[3]) ;
        data = malloc(sizeOfBlock*sizeof(char));
        int fd = open(argv[2], O_RDONLY);
        while((nread = read(fd, data, sizeOfBlock)) > 0)
        {
            numberOfCharacters += nread ;
            for (i = 0; i < nread; ++i)
            {
                c = data[i] ;
                if(isprint(c))
                    numOfPr ++ ;
                else if(isspace (c))
                    numOfWs ++ ;
            }
        }
        wsP = (numOfWs / (float)numberOfCharacters)*100;
        prP = (numOfPr / (float)numberOfCharacters)*100 ;
        printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%');
        printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%');
        exit(0);
    }
    
  2. 将整个文件复制到内存中mmap(),然后开始从内存中读取它:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <errno.h>
    #include <ctype.h>
    #include <string.h>
    int main(int arg, char *argv[])
    {
        char c ;
        int i, numOfWs = 0, numOfPr = 0, numberOfCharacters = 0;
        char *data;
        float wsP = 0;
        float prP = 0;
        struct stat s;
        int fp = open("x.txt", O_RDWR);
        int status = fstat (fp, &s);
        int size = s.st_size;
        data = mmap((caddr_t)0, size, PROT_READ, MAP_SHARED, fp,0);
        for (i=0; i<size; i++){
            char c;
            c = data[i];
            if(isspace(c))
                numOfWs ++;
            else if(isprint(c))
                numOfPr ++;
            numberOfCharacters ++ ;
        }
        wsP = (numOfWs / (float)numberOfCharacters)*100;
        prP = (numOfPr / (float)numberOfCharacters)*100 ;
        printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%');
        printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%');
        close(fp);
        exit(0);
    }
    

当我用同一个文件(由 Python 代码创建)检查这两种方式时,我得到了两个不同的百分比答案,但每种方式读取的字符数是相同的(1000)。

这是我从 Python 代码生成的文件(我不知道当我在这里复制它时会发生什么,它包含很多控制字符):

el*2mlz_XyjP@%?Sw}Qo~.."tJ^~6,eN8+kq l)*N-1oupE
)coFKoA0\=|W'{Oezx~^p(B5ZJe!AdYb{Gflv&wwCf8}>3"v*>9\pW8PIs;qpX7RSk<9}&8B$u kNaq(mJK$N-!38?E%8-T,I1zC~0O=}FH*
x9x6Q%GT_C0j>7:@EG{N ?Eh$?18;Ncy[3 $'ikKs%:A].?e;i4`x"k!VD]}*pw
?\wE~Vix7^H~[26lsN?_GO$vz3M464S`+h=A(5@]q<&<+ hjmehAb-_3*3F8&#iM3p)6T`S9Q\yZwm$U`OHG}02{A)WcVzBR1h}H?qhF:P^-j5AQ1<7FD60j#B#}9Z=}2QReaYy|{Wv<^!yOC/7P}n*ZEPV2@8cU),=*5]]d a:3J;Y(?D?31$pcrquc#&PB;A[9lV+gJ%WZ6K~A|%^E_\3dM/?"y)BtUtG"3hf}W4,3DrXxTyl\UbWwCbMufqCNWx |hiJ\>43S6tCCS)rEo0.cz5PjgK0_AKN|8'g]byLp9AlrZDuK1OX,Csa}nu&i_p,#
Wyc{Q
LA\4:!WSq"ln|Pv.B;+N'h%O;tu(CgIh~OYIXCl+6~nSxBuybP nH:j;t'\vk&p}
,;3Ny#`Ug!rVbqExY|  %BVCD^D~z:L(j8L!    @   X4a!KBNCQ4z&3^9[O<fkM-qrOq5F/M*]yyU+-VLdZRtUu
a"=a b%c~GI|tcC/
P'/`t|hZ/2iHd94l"%;4-{)VUw%%3"e%IQ{RAX]NeMcsh&@LziT0)_T"2XADH&NYqa<6,$wdSp@LIMGA&,Gx1mj|t't?7=YtT77r<qi8;|tzi kOAi'dq%+g2   5hY?XTj{F)18.Vd!!$Q{D)}$7XxO)Vi%29*,P"cXkC,M|&brd&-DGF>V4 %N)a"VM+TQ$FI;YiL-0 YSxXgC@i~,o6/a7U2c"eGr\N7^B:'dytlOOS(iy\lhC7vnW,f o;vKUNa
Hg#u}W4N wUM

这是结果mmap()

961 printable characters out of 1000 bytes, 96.10%
39 whitespace characters out of 1000 bytes, 3.90%

这是结果chunk reading

974 printable characters out of 1000 bytes, 97.40%
26 whitespace characters out of 1000 bytes, 2.60%

为什么两个方法中的数量printable characters不同,但是文件是一样的呢?

我认为在分块模式下isspace()不接受空格,而是将其视为printable character.

4

2 回答 2

1

您没有numberOfCharacters0第一个程序中初始化。

这意味着numberOfCharacters之前的值是未确定的

numberOfCharacters += nread ;

被执行,这是将声明与定义分开的一个很好的理由。

于 2015-06-28T15:50:08.737 回答
1

问题是空格是可打印的,isprint()我改变了它的顺序isspace()isprint()使用if()...else if()它变成了真的。

于 2015-06-28T19:15:48.443 回答