This code scans a 5.1 GB text file for words that occur fewer than 100 times, then rewrites all 5.1 GB to an output text file with those words replaced by <unk>. The main problem is that producing output.txt takes a very long time. I suspect the write_text() function causes it through the way it opens the dataset file and the output file.
The goal behind this script: I have a pre-built vocabulary, and I have a text. The text may contain new words that are not in my vocabulary, and I want to add those to the vocabulary. But I only want to add the relevant new words, i.e. those that appear at least 100 times. New words that appear fewer than 100 times in the text are one-offs and don't matter, so I want to replace them with <unk>.
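For illustration, here is a minimal sketch of the intended transformation on toy data (the vocabulary, corpus, and threshold of 2 below are hypothetical stand-ins for vocab.txt, the 5.1 GB dataset, and 100):

from collections import Counter

vocab = {'the', 'cat'}                                 # pre-built vocabulary
lines = ['the cat zzz', 'zzz the cat', 'qqq the cat']  # toy corpus
counts = Counter(t for ln in lines for t in ln.split() if t not in vocab)
add = [w for w, c in counts.items() if c >= 2]   # frequent: extend the vocab
drop = [w for w, c in counts.items() if c < 2]   # rare: rewrite as <unk>
# counts == Counter({'zzz': 2, 'qqq': 1}), add == ['zzz'], drop == ['qqq']

Every 'qqq' in the rewritten corpus would become <unk>, while 'zzz' would be kept and appended to the vocabulary.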
from collections import Counter

extra_words = []  # every out-of-vocabulary token occurrence in the dataset
new_words = []    # OOV words seen fewer than 100 times -> replaced with <unk>
add_words = []    # OOV words seen at least 100 times -> appended to the vocab
def get_vocab():
    vocab = set()
    with open('vocab.txt', 'r', encoding='utf-8') as rd:
        for line in rd:
            word = line.split(' ')[0]  # each line looks like "<word> <index>"
            vocab.add(word)
    return vocab
def _count(text):
    vocab = get_vocab()
    with open(text, 'r', encoding='utf-8') as fd:
        for line in fd:
            for token in line.split():
                if token not in vocab:
                    extra_words.append(token)
    word_count = Counter(extra_words)
    # del word_count['"']  # uncomment to drop a punctuation token from the counts
    for word, count in word_count.items():
        if count < 100:
            new_words.append(word)
        else:
            add_words.append(word)
    write_text()
    # return len(new_words), word_count.most_common()[0]
def write_text():
    with open('dataset', 'r', encoding='utf-8') as fd:
        f = fd.readlines()  # reads the whole 5.1 GB dataset into memory
    with open('output.txt', 'w', encoding='utf-8') as rd:
        new_text = []
        for line in f:
            new_line = []
            for token in line.split():
                # new_words is a list, so this membership test scans it
                # once per token
                if token in new_words:
                    new_line.append('<unk>')
                else:
                    new_line.append(token)
            new_text.append(' '.join(new_line))
        print('\n'.join(new_text), file=rd)
        # print(' '.join(new_line), file=rd)
def add_vocab():
    ln = len(get_vocab())
    # append so the existing entries are kept; 'w' would overwrite vocab.txt
    with open('vocab.txt', 'a', encoding='utf-8') as fd:
        for idx, word in enumerate(add_words):  # add_words holds bare strings
            print(f'{word} {ln + idx + 1}', file=fd)
_count('dataset')  # the debug return inside _count is commented out, so there is nothing to print
add_vocab()