我已经在一个 410k 行的大型数据集上实现了朴素贝叶斯算法。现在所有记录都能被正确分类,但问题是程序花了将近一个小时才把记录写入相应的文件。改进我的代码性能的最佳方法是什么?下面这段代码负责将 410k 条记录写入相应的文件。谢谢。
fp=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
token = strtok(line,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token1 = strtok(token,"\n");
memcpy(mystr,&token1[0],strlen(token1)-1);
mystr[strlen(token1)-1] = '\0';
if( strcmp(mystr,"ok") == 0 )
counter_ok++;
else
counter_fraud++;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
prblty_ok = counter_ok/(counter_ok+counter_fraud);
prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fclose(fp);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","a");
fp3=fopen("sales_unknown_fraud_classified.txt","a");
while(fgets(line1,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
unknwn_attr1 = strtok(line1,",");
unknwn_attr2 = strtok(NULL,",");
unknwn_attr3 = strtok(NULL,",");
unknwn_attr4 = strtok(NULL,",");
unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
fp1=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp1)!=NULL) //Reading each line from file to calculate the file size.
{
ok_fraud_attr1 = strtok(line,",");
ok_fraud_attr2 = strtok(NULL,",");
ok_fraud_attr3 = strtok(NULL,",");
ok_fraud_attr4 = strtok(NULL,",");
ok_fraud_attr5 = strtok(NULL,",");
ok_fraud_attr6 = strtok(NULL,",");
memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
//ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
//printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));
if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_ok_attr5++;
}
if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_fraud_attr5++;
}
}
fclose(fp1);
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if(counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
fclose(fp);
fclose(fp2);
fclose(fp3);