0

我的任务是创建一个词频分析程序,该程序从文本文件中读取内容,并生成以下示例输出:

SUMMARY:

27340 words
2572 unique words

WORD FREQUENCIES (TOP 10):

the 1644
and  872
to  729
a  632
it  595
she  553
i 545
of  514
said 462
you 411

我试图创建一个程序来实现这样的输出。我对 C 编程很陌生,所以虽然它在一定程度上有效,但可能存在很多效率问题/缺陷。这是我到目前为止写的:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_WORD 32
#define MAX_TEXT_LENGTH 10000

// ===========================================
//                 STRUCTURE
//============================================


typedef struct word {
char *str;              /* Stores the word */
int freq;               /* Stores the frequency */
struct word *pNext;     /* Pointer to the next word counter in the list */
} Word;

// ===========================================
//             FUNCTION PROTOTYPES
//============================================

int getNextWord(FILE *fp, char *buf, int bufsize);   /* Given function to get words */
void addWord(char *pWord);                          /* Adds a word to the list or updates exisiting word */
void show(Word *pWordcounter);        /* Outputs a word and its count of occurrences */
Word* createWordCounter(char *word);  /* Creates a new WordCounter structure */

// ===========================================
//             GLOBAL VARIABLES
//============================================

Word *pStart = NULL;                  /* Pointer to first word counter in the list */

int totalcount = 0;                  /* Total amount of words */
int uniquecount = 0;                /* Amount of unique words */



// ===========================================
//                 MAIN
//============================================      


int main () {

    /* File pointer */
    FILE * fp;
    /* Read text from here */
    fp = fopen("./test.txt","r");

    /* buf to hold the words */
    char buf[MAX_WORD];

    /* Size */
    int size = MAX_TEXT_LENGTH;


    /* Pointer to Word counter */
    Word *pCounter = NULL;


    /* Read all words from text file */

    while (getNextWord(fp, buf, size)) {

        /* Add the word to the list */
        addWord(buf); 

        /* Increment the total words counter */
        totalcount++;
    }


    /* Loop through list and figure out the number of unique words */
    pCounter = pStart;
    while(pCounter != NULL)
    {
        uniquecount++;
        pCounter = pCounter->pNext;
    }

    /* Print Summary */

    printf("\nSUMMARY:\n\n");
    printf("   %d words\n", totalcount); /* Print total words */
    printf("   %d unique words\n", uniquecount); /* Print unique words */




    /* List the words and their counts */
    pCounter = pStart;
    while(pCounter != NULL)
    {
        show(pCounter);
        pCounter = pCounter->pNext;
    }
    printf("\n");


    /* Free the allocated  memory*/
    pCounter = pStart;
    while(pCounter != NULL)
    {
        free(pCounter->str);        
        pStart = pCounter;           
        pCounter = pCounter->pNext;  
        free(pStart);                  
    }

    /* Close file */
    fclose(fp);

    return 0;

}


// ===========================================
//                 FUNCTIONS
//============================================


void show(Word *pWordcounter)
{
  /* output the word and it's count */
  printf("\n%-30s   %5d", pWordcounter->str,pWordcounter->freq);

}

void addWord(char *word)
{
  Word *pCounter = NULL;
  Word *pLast = NULL;

  if(pStart == NULL)
  {
    pStart = createWordCounter(word);
    return;
  }

  /* If the word is in the list, increment its count */
  pCounter = pStart;
  while(pCounter != NULL)
  {
    if(strcmp(word, pCounter->str) == 0)
    {
      ++pCounter->freq;

      return;
    }
    pLast = pCounter;            
    pCounter = pCounter->pNext;  
  }

  /* Word is not in the list, add it */
  pLast->pNext = createWordCounter(word);
}

Word* createWordCounter(char *word)
{
  Word *pCounter = NULL;
  pCounter = (Word*)malloc(sizeof(Word));
  pCounter->str = (char*)malloc(strlen(word)+1);
  strcpy(pCounter->str, word);
  pCounter->freq = 1;
  pCounter->pNext = NULL;
  return pCounter;
}

int getNextWord(FILE *fp, char *buf, int bufsize) {
    char *p = buf;
    char c;


    //skip all non-word characters
    do {
        c = fgetc(fp);
        if (c == EOF) 
            return 0;
        } while (!isalpha(c));

    //read word chars

    do {
        if (p - buf < bufsize - 1)
        *p++ = tolower(c);
        c = fgetc(fp);
        } while (isalpha(c));

        //finalize word
        *p = '\0';
        return 1;
        }

它正确显示摘要。字数和独特字数完全正确。然后它会列出在文件中找到的每个唯一单词并显示正确的出现次数。

我现在需要做的(以及我遇到很多麻烦的事情)是按出现次数以降序对我的链表进行排序。最重要的是,它应该只显示前 10 个单词而不是全部(一旦我对链表进行了排序,这应该是可行的)。

我知道代码本身现在效率很低,但我现在主要关心的是获得正确的输出。

如果有人可以通过排序算法帮助我,或者至少为我指出正确的方向,我将不胜感激。

谢谢你。

4

2 回答 2

3

对于初学者来说,这个想法可能有点野心勃勃,但了解标准库中的函数总是一个好主意。如果您知道您的链表有多大,您可以使用malloc它为包含相同数据的数组分配空间。然后,您可以使用qsort为您对数据进行排序。

函数malloc,并且qsort是标准 C 库中经常使用的成员。

于 2012-08-06T00:47:35.910 回答
2

不要对链表进行排序,这非常低效且容易出错。将相关数据复制到数组中并使用qsort方法。

当你让你的算法更高效时,我建议使用trie.

于 2012-08-06T00:47:58.077 回答