在搜索并试用了不同的软件包、并测量了每个软件包计算分数所需的时间之后,我发现 NLTK 的 corpus_bleu 和 PyRouge 效率最高。请注意,我的每条记录都有多个候选文本(hypotheses),因此我会为每条记录分别计算一次平均值。以下是我计算 BLEU 的做法:
# Whitespace-tokenise every item of `ref` (presumably the reference strings,
# defined elsewhere -- confirm). The extra outer list makes this a one-entry
# list of reference sets, matching the single candidate passed to corpus_bleu
# in find_my_bleu().
reference = [[ref_sentence.split() for ref_sentence in ref]]
def find_my_bleu(text, w):
    """Score one candidate string against the module-level ``reference``.

    ``text`` is split on whitespace and wrapped as a one-hypothesis corpus,
    then handed to ``corpus_bleu`` with the n-gram weights ``w`` and the
    ``cc.method4`` smoothing function.

    Returns whatever ``corpus_bleu`` returns for that single candidate.
    """
    tokens = text.split()
    return corpus_bleu(
        reference,
        [tokens],
        weights=w,
        smoothing_function=cc.method4,
    )
def get_final_bleu(output_df):
    """Add BLEU columns to ``output_df`` in place and print their overall means.

    For every row, each item of the ``'final_predicted_verses'`` cell is scored
    with ``find_my_bleu`` under three weight tuples, producing the list-valued
    columns ``bleu_1``, ``bleu_2`` and ``bleu_3``. Each list is then averaged
    into ``bleu_<n>_mean``, and the mean of every ``*_mean`` column is printed.

    Nothing is returned; the frame is modified in place.
    """
    print('Started calculating the bleu scores...')
    predictions = output_df.loc[:, 'final_predicted_verses']

    # Each weight tuple selects a single n-gram order (1, 2 or 3).
    weights_by_column = (
        ('bleu_1', (1, 0, 0, 0)),
        ('bleu_2', (0, 1, 0, 0)),
        ('bleu_3', (0, 0, 1, 0)),
    )
    for column, weights in weights_by_column:
        output_df.loc[:, column] = predictions.apply(
            lambda verses, weights=weights: [
                find_my_bleu(verse, weights) for verse in verses
            ]
        )

    print('Now the average score...')
    # Highest order first, so the mean columns are created in the same order
    # as before (bleu_3_mean, bleu_2_mean, bleu_1_mean).
    mean_columns = (
        ('bleu_3', 'bleu_3_mean'),
        ('bleu_2', 'bleu_2_mean'),
        ('bleu_1', 'bleu_1_mean'),
    )
    for scores_column, mean_column in mean_columns:
        output_df.loc[:, mean_column] = output_df.loc[:, scores_column].apply(
            lambda scores: np.mean(scores)
        )

    summary_lines = (
        ('mean bleu_3 score: ', 'bleu_3_mean'),
        ('mean bleu_2 score: ', 'bleu_2_mean'),
        ('mean bleu_1 score: ', 'bleu_1_mean'),
    )
    for label, mean_column in summary_lines:
        print(label, np.mean(output_df.loc[:, mean_column]))
对于 ROUGE:
# Shared ROUGE scorer with ROUGE-1, ROUGE-2 and ROUGE-L enabled and the
# W / S / SU variants disabled.
# It must be bound to the name `rouge`: find_my_rouge() calls
# `rouge.evaluate_tokenized(...)`, so any other name raises NameError there.
rouge = PyRouge(
    rouge_n=(1, 2),
    rouge_l=True,
    rouge_w=False,
    rouge_s=False,
    rouge_su=False,
)
def find_my_rouge(text):
    """Score one candidate string with the module-level ``rouge`` evaluator.

    ``text`` is split on whitespace and nested two lists deep before being
    passed to ``rouge.evaluate_tokenized`` together with
    ``[[reference_rouge]]`` (``reference_rouge`` is defined elsewhere).

    Returns the evaluator's result unchanged.
    """
    tokens = text.split()
    return rouge.evaluate_tokenized([[tokens]], [[reference_rouge]])
然后对所有得分取平均值:
def get_short_rouge(list_dicts):
    """Average a list of ROUGE result dicts into a single dict of the same shape.

    Every item of ``list_dicts`` must provide ``'rouge-1'``, ``'rouge-2'`` and
    ``'rouge-l'``, each mapping ``'r'``, ``'p'`` and ``'f'`` to a number.

    Returns a dict with the same keys holding the arithmetic mean of each
    value across the list.

    Raises ZeroDivisionError when ``list_dicts`` is empty and KeyError when an
    item lacks one of the expected keys.
    """
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    parts = ('r', 'p', 'f')

    # Running totals, accumulated left to right in input order.
    totals = {metric: {part: 0 for part in parts} for metric in metrics}
    for scores in list_dicts:
        for metric in metrics:
            for part in parts:
                totals[metric][part] += scores[metric][part]

    count = len(list_dicts)
    return {
        metric: {part: totals[metric][part] / count for part in parts}
        for metric in metrics
    }
def get_overal_rouge_mean(output_df):
    """Average ROUGE per record, then across all records, and print the result.

    Each cell of the ``'rouge'`` column (a list of ROUGE result dicts) is
    reduced with ``get_short_rouge`` and stored in a new ``'rouge_mean'``
    column. Those per-record means are then averaged across all rows with the
    same helper, and the overall dict is printed.

    The per-record values are read straight from the column rather than via
    ``output_df.loc[i, ...]`` for ``i in range(len(output_df))``. That lookup
    is label-based and only works when the index is exactly 0..n-1; on a
    filtered or otherwise re-indexed frame it raises KeyError or returns the
    wrong object.

    Returns ``output_df``, which is modified in place.

    Raises ZeroDivisionError (from ``get_short_rouge``) if the frame has no
    rows or a ``'rouge'`` cell is an empty list.
    """
    print('Started getting the overall rouge of each record...')
    output_df.loc[:, 'rouge_mean'] = output_df.loc[:, 'rouge'].apply(
        lambda record_scores: get_short_rouge(record_scores)
    )

    print('Started getting the overall rouge of all record...')
    # Position-independent read of the per-record means, in row order.
    per_record_means = output_df['rouge_mean'].tolist()
    overall = get_short_rouge(per_record_means)

    print('overall rouge scores: ')
    print(overall)
    return output_df
我希望它可以帮助任何遇到这个问题的人。