我正在使用 NLTK 的一些语料库材料,并努力提高代码还原乱序单词的正确率......目前达到了 91.250%。
练习的重点是修改 represent_word 函数以提高分数。该函数接收一个字符串形式的单词,这个单词可能是被打乱的,也可能是未被打乱的。该函数生成单词的“表示”,它是一个包含以下信息的列表:
- 单词长度
- 元音数
- 辅音数
- 单词的第一个和最后一个字母(这些总是未加扰的)
- 一个由语料库中最常用单词组成的元组,其中每个单词的所有字符都出现在给定的输入单词中。
我也尝试过分析前缀和后缀的变位词(字母重排),但在已经使用“与输入单词共享字符的最常用单词元组”这一特征的情况下,它们对分数没有任何额外贡献。
我不知道为什么我不能提高分数。我什至尝试通过从另一个语料库导入单词来增加字典大小。
这里唯一可以修改的部分是 represent_word 函数及其上方的定义。不过,我还是附上了完整的源代码,以便为大家提供一些可能有帮助的信息。
import nltk
import re
def word_counts(corpus, wordcounts=None):
    """Count how often each word occurs in the corpus, ignoring case.

    corpus: iterable of word strings.
    wordcounts: optional dict mapping lowercased words to counts. When
        given, it is updated in place and returned; when omitted, a new
        dict is created for this call.

    Returns the dict mapping each lowercased word to its count.
    """
    # A `{}` default would be created once and shared by every call, so
    # counts from one corpus would leak into the next; create it per call.
    if wordcounts is None:
        wordcounts = {}
    for word in corpus:
        key = word.lower()
        wordcounts[key] = wordcounts.get(key, 0) + 1
    return wordcounts
# Lowercased, purely alphabetic tokens of Jane Austen's "Persuasion".
JA_list = [token.lower()
           for token in nltk.corpus.gutenberg.words('austen-persuasion.txt')
           if token.lower().isalpha()]
JA_freqdist = nltk.FreqDist(JA_list)
# NOTE(review): the [:0] slice keeps no entries at all, so JA_topwords is
# always empty and uniquewords ends up equal to PP_topwords -- confirm that
# this is the intended setting.
JA_toplist = sorted(JA_freqdist.items(), key=lambda x: x[1], reverse=True)[:0]
JA_topwords = [entry[0] for entry in JA_toplist]

# Lowercased, purely alphabetic whitespace-separated tokens of the original
# text. The file is read inside a `with` block so the handle is closed
# instead of being left open for the rest of the run.
with open("Pride and Prejudice.txt") as pp_file:
    PP_list = [token.lower()
               for token in pp_file.read().split()
               if token.lower().isalpha()]
PP_freqdist = nltk.FreqDist(PP_list)
# The seven most frequent words of the original text.
PP_toplist = sorted(PP_freqdist.items(), key=lambda x: x[1], reverse=True)[:7]
PP_topwords = [entry[0] for entry in PP_toplist]

# Top words of both texts without duplicates: words found only in
# JA_topwords come first, followed by every word of PP_topwords.
uniquewords = [word for word in JA_topwords if word not in PP_topwords]
uniquewords.extend(PP_topwords)
def represent_word(word, dictionary=None):
    """Build a representation of a word that survives interior scrambling.

    Every feature used here is unchanged when the letters between the first
    and the last one are shuffled, so a scrambled word and its original
    produce the same representation.

    word: non-empty string, scrambled or not.
    dictionary: optional iterable of reference words to test against
        `word`. Defaults to the module-level `uniquewords` list.

    Returns a tuple
        (first letter, last letter, length, number of consonants,
         number of vowels, common words)
    where "common words" is a tuple of the dictionary words whose letters
    all occur in `word`, or None when there is no such word.

    Raises IndexError if `word` is empty.
    """
    if dictionary is None:
        dictionary = uniquewords

    # 'y' is counted as a vowel. The consonant string previously read
    # "...stvexz": it counted 'e' a second time and never counted 'w'.
    vowels = "aeiouy"
    consonants = "bcdfghjklmnpqrstvwxz"
    number_of_vowels = sum(word.count(letter) for letter in vowels)
    number_of_consonants = sum(word.count(letter) for letter in consonants)

    # Membership only: repeated letters in a dictionary word do not need to
    # be repeated in `word`.
    findings = tuple(candidate for candidate in dictionary
                     if all(letter in word for letter in candidate))
    common_words = findings if findings else None

    return (word[0], word[-1], len(word),
            number_of_consonants, number_of_vowels, common_words)
def create_mapping(words, mapping=None):
    """Map each word representation to the most common word that has it.

    words: mapping of word -> occurrence count.
    mapping: optional dict of representation -> (word, count) to extend.
        When given, it is updated in place and returned; when omitted, a
        new dict is created for this call.

    Returns the dict mapping each representation (as produced by
    represent_word) to a (word, count) pair holding the highest-count word
    seen for that representation. On equal counts the word seen first is
    kept.
    """
    # A `{}` default would be created once and shared by every call, so
    # entries from an earlier corpus would leak into later mappings.
    if mapping is None:
        mapping = {}
    for word in words:
        count = words[word]
        representation = represent_word(word)
        mapping.setdefault(representation, ("", 0))
        if mapping[representation][1] < count:
            mapping[representation] = (word, count)
    return mapping
if __name__ == '__main__':
    # Build the representation -> most common word mapping from the word
    # frequencies of "Persuasion" by Jane Austen, which serves as the corpus.
    words = JA_freqdist
    mapping = create_mapping(words)

    # Load the words of the scrambled file, lowercased, skipping blank lines.
    with open("Pdrie and Puicejdre.txt") as scrambled_file:
        scrambled_lines = [line.split() for line in scrambled_file
                           if len(line.strip()) > 0]
    scrambled_words = [word.lower()
                       for line in scrambled_lines for word in line]

    # Descramble each word through the mapping; a word whose representation
    # is unknown is kept exactly as it was read.
    descrambled_words = []
    for scrambled_word in scrambled_words:
        representation = represent_word(scrambled_word)
        if representation in mapping:
            descrambled_word = mapping[representation][0]
        else:
            descrambled_word = scrambled_word
        descrambled_words.append(descrambled_word)

    # Load the words of the original file the same way.
    with open("Pride and Prejudice.txt") as original_file:
        original_lines = [line.split() for line in original_file
                          if len(line.strip()) > 0]
    original_words = [word.lower()
                      for line in original_lines for word in line]

    # Pair the words up by position and record whether each guess is right.
    word_pairs = zip(descrambled_words, original_words)
    judgements = [descrambled_word == original_word
                  for (descrambled_word, original_word) in word_pairs]

    # Print the share of correct words. The parenthesised single-argument
    # form behaves the same as a Python 2 print statement and as the
    # Python 3 print function. Empty input would otherwise divide by zero.
    if judgements:
        accuracy = float(judgements.count(True)) / len(judgements)
        print("Correct: {0:.3%}".format(accuracy))
    else:
        print("Correct: n/a (no words to compare)")