我的任务是创建一个词频分析程序,该程序从文本文件中读取内容,并生成以下示例输出:
SUMMARY:
27340 words
2572 unique words
WORD FREQUENCIES (TOP 10):
the 1644
and 872
to 729
a 632
it 595
she 553
i 545
of 514
said 462
you 411
我试图创建一个程序来实现这样的输出。我对 C 编程很陌生,所以虽然它在一定程度上有效,但可能存在很多效率问题/缺陷。这是我到目前为止写的:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAX_WORD 32
#define MAX_TEXT_LENGTH 10000
// ===========================================
// STRUCTURE
//============================================
typedef struct word {
char *str; /* Stores the word */
int freq; /* Stores the frequency */
struct word *pNext; /* Pointer to the next word counter in the list */
} Word;
// ===========================================
// FUNCTION PROTOTYPES
//============================================
int getNextWord(FILE *fp, char *buf, int bufsize); /* Given function to get words */
void addWord(char *pWord); /* Adds a word to the list or updates exisiting word */
void show(Word *pWordcounter); /* Outputs a word and its count of occurrences */
Word* createWordCounter(char *word); /* Creates a new WordCounter structure */
// ===========================================
// GLOBAL VARIABLES
//============================================
Word *pStart = NULL; /* Pointer to first word counter in the list */
int totalcount = 0; /* Total amount of words */
int uniquecount = 0; /* Amount of unique words */
// ===========================================
// MAIN
//============================================
int main () {
/* File pointer */
FILE * fp;
/* Read text from here */
fp = fopen("./test.txt","r");
/* buf to hold the words */
char buf[MAX_WORD];
/* Size */
int size = MAX_TEXT_LENGTH;
/* Pointer to Word counter */
Word *pCounter = NULL;
/* Read all words from text file */
while (getNextWord(fp, buf, size)) {
/* Add the word to the list */
addWord(buf);
/* Increment the total words counter */
totalcount++;
}
/* Loop through list and figure out the number of unique words */
pCounter = pStart;
while(pCounter != NULL)
{
uniquecount++;
pCounter = pCounter->pNext;
}
/* Print Summary */
printf("\nSUMMARY:\n\n");
printf(" %d words\n", totalcount); /* Print total words */
printf(" %d unique words\n", uniquecount); /* Print unique words */
/* List the words and their counts */
pCounter = pStart;
while(pCounter != NULL)
{
show(pCounter);
pCounter = pCounter->pNext;
}
printf("\n");
/* Free the allocated memory*/
pCounter = pStart;
while(pCounter != NULL)
{
free(pCounter->str);
pStart = pCounter;
pCounter = pCounter->pNext;
free(pStart);
}
/* Close file */
fclose(fp);
return 0;
}
// ===========================================
// FUNCTIONS
//============================================
void show(Word *pWordcounter)
{
/* output the word and it's count */
printf("\n%-30s %5d", pWordcounter->str,pWordcounter->freq);
}
void addWord(char *word)
{
Word *pCounter = NULL;
Word *pLast = NULL;
if(pStart == NULL)
{
pStart = createWordCounter(word);
return;
}
/* If the word is in the list, increment its count */
pCounter = pStart;
while(pCounter != NULL)
{
if(strcmp(word, pCounter->str) == 0)
{
++pCounter->freq;
return;
}
pLast = pCounter;
pCounter = pCounter->pNext;
}
/* Word is not in the list, add it */
pLast->pNext = createWordCounter(word);
}
Word* createWordCounter(char *word)
{
Word *pCounter = NULL;
pCounter = (Word*)malloc(sizeof(Word));
pCounter->str = (char*)malloc(strlen(word)+1);
strcpy(pCounter->str, word);
pCounter->freq = 1;
pCounter->pNext = NULL;
return pCounter;
}
int getNextWord(FILE *fp, char *buf, int bufsize) {
char *p = buf;
char c;
//skip all non-word characters
do {
c = fgetc(fp);
if (c == EOF)
return 0;
} while (!isalpha(c));
//read word chars
do {
if (p - buf < bufsize - 1)
*p++ = tolower(c);
c = fgetc(fp);
} while (isalpha(c));
//finalize word
*p = '\0';
return 1;
}
它正确显示摘要。字数和独特字数完全正确。然后它会列出在文件中找到的每个唯一单词并显示正确的出现次数。
我现在需要做的(以及我遇到很多麻烦的事情)是按出现次数以降序对我的链表进行排序。最重要的是,它应该只显示前 10 个单词而不是全部(一旦我对链表进行了排序,这应该是可行的)。
我知道代码本身现在效率很低,但我现在主要关心的是获得正确的输出。
如果有人可以通过排序算法帮助我,或者至少为我指出正确的方向,我将不胜感激。
谢谢你。