1

这段代码读取一个 5.1GB 的大文本文件,找出其中出现次数少于 100 次的单词,然后把这 5.1GB 的文本重写到输出文件中,并用 unk 替换这些单词。主要问题是生成 output.txt 需要很长时间。我怀疑是 write_text() 方法打开数据集文件和输出文件的方式导致了问题。

这个脚本的目标:我有一个预先构建的词汇表,还有一份文本。文本中可能包含词汇表里没有的新词,所以我想把它们加入词汇表。但我只想添加有意义的新词(出现超过 100 次)。在文本中出现少于 100 次的新词只是偶然出现,并不重要,所以我想把它们替换为“unk”。


from collections import Counter

# Module-level accumulators filled in by _count():
#   extra_words - every out-of-vocabulary token seen, duplicates included
#   new_words   - out-of-vocabulary words seen fewer than 100 times
#   add_words   - out-of-vocabulary words seen 100 times or more
extra_words, new_words, add_words = [], [], []


def get_vocab():
    """Load the vocabulary words from ``vocab.txt``.

    Each non-blank line contributes its first whitespace-separated token;
    anything after that token on the line is ignored.

    Returns:
        set: The distinct words found in the file.
    """
    vocab = set()
    with open('vocab.txt', 'r', encoding='utf-8') as rd:
        # Iterate lazily instead of materialising every line with readlines().
        for line in rd:
            # split() with no argument also discards the trailing newline, so
            # a line holding only a word yields "word" rather than "word\n",
            # and a blank line yields no tokens at all.
            tokens = line.split()
            if tokens:
                vocab.add(tokens[0])

    return vocab


def _count(text):
    """Classify the out-of-vocabulary words of *text* by frequency.

    Every token of *text* that is missing from the vocabulary returned by
    ``get_vocab()`` is appended to the module-level ``extra_words`` list.
    Each distinct such word is then appended to ``new_words`` when it occurs
    fewer than 100 times, or to ``add_words`` otherwise. Finally
    ``write_text()`` is called.

    Args:
        text: Path of the UTF-8 text file to scan.

    Returns:
        None.
    """
    vocab = get_vocab()

    with open(text, 'r', encoding='utf-8') as fd:
        # Iterate the file object directly: readlines() would hold every
        # line of a multi-gigabyte input in memory at once.
        for line in fd:
            extra_words.extend(
                token for token in line.split() if token not in vocab
            )

    word_count = Counter(extra_words)

    # NOTE: punctuation tokens are counted like any other word; nothing is
    # removed from word_count before the threshold is applied.
    for word, occurrences in word_count.items():
        if occurrences < 100:
            new_words.append(word)
        else:
            add_words.append(word)

    write_text()


def write_text():
    """Copy ``dataset`` to ``output.txt`` with rare words masked.

    Every whitespace-separated token that appears in the module-level
    ``new_words`` list is replaced by ``<unk>``. The tokens of each line are
    re-joined with single spaces and every output line ends with a newline.
    An empty dataset produces an empty output file.

    Returns:
        None.
    """
    # A membership test against a list scans it linearly for every token;
    # building a set once makes each test a constant-time hash lookup.
    rare_words = set(new_words)

    with open('dataset', 'r', encoding='utf-8') as fd, \
            open('output.txt', 'w', encoding='utf-8') as rd:
        # Stream line by line so neither the input nor the rewritten text is
        # ever held in memory as a whole.
        for line in fd:
            rd.write(' '.join(
                '<unk>' if token in rare_words else token
                for token in line.split()
            ))
            rd.write('\n')


def add_vocab():
    """Append the words in ``add_words`` to ``vocab.txt``.

    Each word is written on its own line as ``"<word> <index>"``. The first
    index is one more than the number of words ``get_vocab()`` currently
    returns, and it increases by one per added word.

    Returns:
        None.
    """
    ln = len(get_vocab())

    # Append: opening with 'w' would truncate the file and discard the
    # existing vocabulary that these words are meant to extend.
    # NOTE(review): assumes vocab.txt already ends with a newline; otherwise
    # the first appended entry continues the last existing line.
    with open('vocab.txt', 'a', encoding='utf-8') as fd:
        # add_words holds plain strings, so the position has to come from
        # enumerate(); unpacking each word into (idx, word) does not work.
        for idx, word in enumerate(add_words):
            # print() already terminates the line, so the text carries no
            # extra "\n" (which would leave a blank line after every entry).
            print(f'{word} {ln + idx + 1}', file=fd)


# Run the pipeline on 'dataset': classify out-of-vocabulary words and write
# the masked copy, print whatever _count() returned, then update vocab.txt.
count_result = _count('dataset')
print(count_result)
add_vocab()
4

1 回答 1

1

我用莎士比亚全集对此进行了测试。在大小写和标点符号方面,您还有大量工作要做。对于 100 份他的作品(约 500MB),它在我的机器上大约 15 秒就运行完毕。如果您那边的耗时长得无法接受,可能需要对代码进行性能分析(profiling)。请注意,我使用的是您词汇文件的简化版本,因为我不清楚您希望其中包含哪些内容。我使用的版本只是每行一个单词。

import collections

def get_vocabulary(path):
    """Return the set of distinct lines in the UTF-8 file at *path*.

    Newline characters are stripped from both ends of each line; any other
    whitespace is kept as part of the entry.
    """
    with open(path, 'r', encoding='utf-8') as vocab_file:
        return {entry.strip("\n") for entry in vocab_file}

def get_interesting_word_counts(path, vocabulary):
    """Count the tokens of the file at *path* that are not in *vocabulary*.

    The file is read line by line as UTF-8 and split on whitespace.

    Returns:
        collections.Counter: Occurrences of each out-of-vocabulary token.
    """
    tally = collections.Counter()
    with open(path, 'r', encoding='utf-8') as text_file:
        for row in text_file:
            for word in row.split():
                if word not in vocabulary:
                    tally[word] += 1
    return tally

def get_cleaned_text(path, vocabulary, uncommon_words):
    """Yield the lines of the file at *path* with uncommon words masked.

    A token becomes ``<unk>`` only when it is in *uncommon_words* and not in
    *vocabulary*; every other token is kept. The tokens of each line are
    joined with single spaces and the yielded string ends with a newline.
    """
    with open(path, 'r', encoding='utf-8') as text_file:
        for row in text_file:
            kept = []
            for word in row.split():
                if word not in vocabulary and word in uncommon_words:
                    kept.append("<unk>")
                else:
                    kept.append(word)
            yield " ".join(kept) + "\n"

vocabulary = get_vocabulary("vocabulary.txt")
word_counts = get_interesting_word_counts("shakespeare.txt", vocabulary)

# Split the out-of-vocabulary words at the 100-occurrence threshold.
common_words = {word for word, count in word_counts.items() if count >= 100}
uncommon_words = {word for word, count in word_counts.items() if count < 100}

# --------------------------------------
# Add frequent but missing words to the vocabulary file
# --------------------------------------
with open('vocabulary.txt', 'a', encoding='utf-8') as file_out:
    for word in common_words:
        file_out.write(word + "\n")

# --------------------------------------
# Rewrite the text, masking uncommon words
# --------------------------------------
# get_cleaned_text() is a generator, so lines are produced as they are written.
cleaned_text = get_cleaned_text("shakespeare.txt", vocabulary, uncommon_words)
with open('shakespeare_out.txt', 'w', encoding='utf-8') as file_out:
    file_out.writelines(cleaned_text)

你可以得到我在这里使用的文本:http://www.gutenberg.org/ebooks/100

源文本开头:

The Project Gutenberg eBook of The Complete Works of William Shakespeare, by William Shakespeare

结果文件开始:

<unk> <unk> <unk> <unk> of The <unk> <unk> of <unk> <unk> by <unk> <unk>

更新的词汇文件开始:

as
run
he’s
this.
there’s
like
you.
于 2021-04-26T16:03:06.330 回答