我正在尝试使用 gensim 训练 Doc2Vec 模型,该模型具有 114M 唯一文档/标签和大约 3M 唯一单词的词汇大小。我在 Azure 上有 115GB Ram linux 机器。当我运行 build_vocab 时,迭代器会解析所有文件,然后抛出如下所示的内存错误。
Traceback (most recent call last):
File "doc_2_vec.py", line 63, in <module>
model.build_vocab(sentences.to_array())
File "/home/meghana/.local/lib/python2.7/site-packages/gensim/models/word2vec.py", line 579, in build_vocab
self.finalize_vocab(update=update) # build tables & arrays
File "/home/meghana/.local/lib/python2.7/site-packages/gensim/models/word2vec.py", line 752, in finalize_vocab
self.reset_weights()
File "/home/meghana/.local/lib/python2.7/site-packages/gensim/models/doc2vec.py", line 662, in reset_weights
self.docvecs.reset_weights(self)
File "/home/meghana/.local/lib/python2.7/site-packages/gensim/models/doc2vec.py", line 390, in reset_weights
self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
MemoryError
我的代码-
import parquet
import json
import collections
import multiprocessing
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
class LabeledLineSentence(object):
def __init__(self, sources):
self.sources = sources
flipped = {}
def __iter__(self):
for src in self.sources:
with open(src) as fo:
for row in parquet.DictReader(fo, columns=['Id','tokens']):
yield LabeledSentence(utils.to_unicode(row['tokens']).split('\x01'), [row['Id']])
## list of files to be open ##
sources = glob.glob("/data/meghana_home/data/*")
sentences = LabeledLineSentence(sources)
#pre = Doc2Vec(min_count=0)
#pre.scan_vocab(sentences)
"""
for num in range(0, 20):
print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)
print("done")
"""
NUM_WORKERS = multiprocessing.cpu_count()
NUM_VECTORS = 300
model = Doc2Vec(alpha=0.025, min_alpha=0.0001,min_count=15, window=3, size=NUM_VECTORS, sample=1e-4, negative=10, workers=NUM_WORKERS)
model.build_vocab(sentences)
print("built vocab.......")
model.train(sentences,total_examples=model.corpus_count, epochs=10)
根据顶部的内存使用情况是-
有人可以告诉我预期的内存是多少吗?什么是更好的选择 - 添加交换空间并减慢进程或添加更多内存,以便集群的成本最终可能相等。gensim 将哪些向量存储在内存中?我为内存有效使用而缺少的任何标志。