2

我正在使用 NLTK 的一些语料库材料。我正在努力提高我的代码的解码分数……目前我达到了 91.250%。

练习的重点是更改 represent_word 函数以提高分数。该函数接收一个字符串形式的单词,这个单词要么是加扰的,要么是未加扰的。该函数生成单词的“表示”,它是一个包含以下信息的列表:

  • 字长
  • 元音数
  • 辅音数
  • 单词的第一个和最后一个字母(这些总是未加扰的)
  • 语料库中最常用单词的元组,其字符也是给定单词输入的成员。

我也尝试分析前缀和后缀的字谜,但它们对带有常见字符元组的最常见单词的阴影没有任何贡献。

我不知道为什么我不能提高分数。我甚至尝试通过从另一个语料库导入单词来增加字典大小。

此处唯一可以更改的部分是代表字函数及其上方的定义。但是,我将整个来源包括在内,以防它可能会为某些人提供一些有见地的信息。

    import nltk
    import re

    def word_counts(corpus, wordcounts=None):
        """Count occurrences of each word (lower-cased) in *corpus*.

        Args:
            corpus: iterable of word strings.
            wordcounts: optional dict to update in place. A fresh dict is
                created when omitted — the original ``wordcounts={}`` default
                was a shared mutable object, so counts leaked between calls.

        Returns:
            dict mapping lower-cased word -> occurrence count.
        """
        if wordcounts is None:
            wordcounts = {}
        for word in corpus:
            key = word.lower()
            wordcounts[key] = wordcounts.get(key, 0) + 1
        return wordcounts

# Frequency-ranked word list from Austen's "Persuasion" (the corpus used
# to build the descrambling mapping).
# BUGFIX: the original slice was [:0], which always produced an EMPTY top
# list — JA_topwords contributed nothing to the word representations, which
# is why enlarging the dictionary never changed the score. Use the same
# cutoff as the Pride-and-Prejudice list below.
JA_list = filter(lambda x: x.isalpha(),
                 map(lambda x: x.lower(),
                     nltk.corpus.gutenberg.words('austen-persuasion.txt')))
JA_freqdist = nltk.FreqDist(JA_list)
JA_toplist = sorted(JA_freqdist.items(), key=lambda x: x[1], reverse=True)[:7]
JA_topwords = [pair[0] for pair in JA_toplist]

# Frequency-ranked word list from the plain-text "Pride and Prejudice":
# lower-case everything, keep alphabetic tokens only, then take the
# seven most frequent words.
PP_list = filter(lambda x: x.isalpha(),
                 map(lambda x: x.lower(),
                     open("Pride and Prejudice.txt").read().split()))
PP_freqdist = nltk.FreqDist(PP_list)
PP_toplist = sorted(PP_freqdist.items(), key=lambda x: x[1], reverse=True)[:7]
PP_topwords = [entry[0] for entry in PP_toplist]

# Merge the two top-word lists, skipping Persuasion words that already
# appear in the Pride-and-Prejudice list so no word occurs twice.
uniquewords = [w for w in JA_topwords if w not in PP_topwords]
uniquewords.extend(PP_topwords)

def represent_word(word):
    """Build a scramble-invariant representation of *word*.

    Returns a tuple of: first letter, last letter, word length, consonant
    count, vowel count, and a tuple of the top corpus words whose letters
    all occur in *word* (or None when there are none). Every component is
    unchanged when the word's interior letters are scrambled, so scrambled
    and unscrambled forms of the same word share a representation.
    """
    def common_word(word):
        # Collect every top-list word whose characters are all present in
        # the (possibly scrambled) input; order follows `uniquewords`.
        findings = [entry for entry in uniquewords
                    if all(letter in word for letter in entry)]
        return tuple(findings) if findings else None

    vowels = "aeiouy"
    # BUGFIX: the original string was "bcdfghjklmnpqrstvexz" — it held the
    # vowel 'e' (double-counted) exactly where the missing 'w' belongs.
    consonants = "bcdfghjklmnpqrstvwxz"
    number_of_consonants = sum(word.count(c) for c in consonants)
    number_of_vowels = sum(word.count(v) for v in vowels)
    return (word[0], word[-1], len(word),
            number_of_consonants, number_of_vowels, common_word(word))





def create_mapping(words, mapping=None):
    """Map each word representation to the most frequent word bearing it.

    Args:
        words: mapping of word -> frequency (e.g. an nltk.FreqDist).
        mapping: optional dict to update in place. A fresh dict is created
            when omitted — the original ``mapping={}`` default was a shared
            mutable object, so entries leaked between separate calls.

    Returns:
        dict of representation -> (word, frequency), keeping, for every
        representation, the highest-frequency word seen with it.
    """
    if mapping is None:
        mapping = {}
    for word in words:
        representation = represent_word(word)
        mapping.setdefault(representation, ("", 0))
        if mapping[representation][1] < words[word]:
            mapping[representation] = (word, words[word])
    return mapping

if __name__ == '__main__':
    # Build the representation -> (word, frequency) mapping from the
    # unscrambled "Persuasion" frequency distribution; it acts as the
    # decoding dictionary.
    words = JA_freqdist
    mapping = create_mapping(words)

    # Load the words of the scrambled text, skipping blank lines and
    # lower-casing every token.
    with open("Pdrie and Puicejdre.txt") as scrambled_file:
        scrambled_lines = [line.split() for line in scrambled_file if len(line.strip()) > 0 ]
        scrambled_words = [word.lower() for line in scrambled_lines for word in line]

    # Descramble: replace each scrambled word with the most frequent corpus
    # word sharing its representation; fall back to the word unchanged when
    # the representation was never seen in the corpus.
    descrambled_words = []
    for scrambled_word in scrambled_words:
        representation = represent_word(scrambled_word)
        if representation in mapping:
            descrambled_word = mapping[representation][0]
        else:
            descrambled_word = scrambled_word
        descrambled_words.append(descrambled_word)

    # Load the original (unscrambled) text the same way, for scoring.
    with open("Pride and Prejudice.txt") as original_file:
        original_lines = [line.split() for line in original_file if len(line.strip()) > 0 ]
        original_words = [word.lower() for line in original_lines for word in line]

    # Pair descrambled and original words position-by-position.
    word_pairs = zip(descrambled_words, original_words)
    # Mark each pair True when the descrambled word matches the original.
    judgements = [descrambled_word == original_word for (descrambled_word, original_word) in word_pairs]
    # Report accuracy as a percentage (Python 2 print statement).
    print "Correct: {0:.3%}".format(float(judgements.count(True))/len(judgements))
4

0 回答 0